<a href="https://colab.research.google.com/github/rishikakapoor/Malaria_dataset/blob/master/Resnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
! git clone https://github.com/rishikakapoor/Malaria_dataset.git

Cloning into 'Malaria_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 27569 (delta 0), reused 0 (delta 0), pack-reused 27566[K
Receiving objects: 100% (27569/27569), 331.94 MiB | 8.22 MiB/s, done.
Checking out files: 100% (27560/27560), done.


In [0]:
import os
import glob

base_dir = os.path.join('/content/Malaria_dataset/Malaria_dataset')
infected_dir = os.path.join(base_dir,'Parasitized')
healthy_dir = os.path.join(base_dir,'Uninfected')

infected_files = glob.glob(infected_dir+'/*.png')
healthy_files = glob.glob(healthy_dir+'/*.png')
len(infected_files), len(healthy_files)

(13778, 13779)

In [0]:
import numpy as np
import pandas as pd

np.random.seed(42)

files_df = pd.DataFrame({
    'filename': infected_files + healthy_files,
    'label': ['malaria'] * len(infected_files) + ['healthy'] * len(healthy_files)
}).sample(frac=1, random_state=42).reset_index(drop=True)

files_df.head()

Unnamed: 0,filename,label
0,/content/Malaria_dataset/Malaria_dataset/Paras...,malaria
1,/content/Malaria_dataset/Malaria_dataset/Uninf...,healthy
2,/content/Malaria_dataset/Malaria_dataset/Uninf...,healthy
3,/content/Malaria_dataset/Malaria_dataset/Uninf...,healthy
4,/content/Malaria_dataset/Malaria_dataset/Uninf...,healthy


In [0]:
from sklearn.model_selection import train_test_split
from collections import Counter

train_files, test_files, train_labels, test_labels = train_test_split(files_df['filename'].values,
                                                                      files_df['label'].values, 
                                                                      test_size=0.3, random_state=42)
train_files, val_files, train_labels, val_labels = train_test_split(train_files,
                                                                    train_labels, 
                                                                    test_size=0.1, random_state=42)

print(train_files.shape, val_files.shape, test_files.shape)
print('Train:', Counter(train_labels), '\nVal:', Counter(val_labels), '\nTest:', Counter(test_labels))

(17360,) (1929,) (8268,)
Train: Counter({'healthy': 8707, 'malaria': 8653}) 
Val: Counter({'malaria': 1001, 'healthy': 928}) 
Test: Counter({'healthy': 4144, 'malaria': 4124})


In [0]:
import cv2
from concurrent import futures
import threading

def get_img_shape_parallel(idx, img, total_imgs):
    if idx % 5000 == 0 or idx == (total_imgs - 1):
        print('{}: working on img num: {}'.format(threading.current_thread().name,
                                                  idx))
    return cv2.imread(img).shape
  
ex = futures.ThreadPoolExecutor(max_workers=None)
data_inp = [(idx, img, len(train_files)) for idx, img in enumerate(train_files)]
print('Starting Img shape computation:')
train_img_dims_map = ex.map(get_img_shape_parallel, 
                            [record[0] for record in data_inp],
                            [record[1] for record in data_inp],
                            [record[2] for record in data_inp])
train_img_dims = list(train_img_dims_map)
print('Min Dimensions:', np.min(train_img_dims, axis=0)) 
print('Avg Dimensions:', np.mean(train_img_dims, axis=0))
print('Median Dimensions:', np.median(train_img_dims, axis=0))
print('Max Dimensions:', np.max(train_img_dims, axis=0))

Starting Img shape computation:
ThreadPoolExecutor-0_0: working on img num: 0
ThreadPoolExecutor-0_9: working on img num: 5000
ThreadPoolExecutor-0_0: working on img num: 10000
ThreadPoolExecutor-0_2: working on img num: 15000
ThreadPoolExecutor-0_3: working on img num: 17359
Min Dimensions: [46 58  3]
Avg Dimensions: [132.88698157 132.43836406   3.        ]
Median Dimensions: [130. 130.   3.]
Max Dimensions: [382 364   3]


In [0]:
IMG_DIMS = (125, 125)

def get_img_data_parallel(idx, img, total_imgs):
    if idx % 5000 == 0 or idx == (total_imgs - 1):
        print('{}: working on img num: {}'.format(threading.current_thread().name,
                                                  idx))
    img = cv2.imread(img)
    img = cv2.resize(img, dsize=IMG_DIMS, 
                     interpolation=cv2.INTER_CUBIC)
    img = np.array(img, dtype=np.float32)
    return img

ex = futures.ThreadPoolExecutor(max_workers=None)
train_data_inp = [(idx, img, len(train_files)) for idx, img in enumerate(train_files)]
val_data_inp = [(idx, img, len(val_files)) for idx, img in enumerate(val_files)]
test_data_inp = [(idx, img, len(test_files)) for idx, img in enumerate(test_files)]

print('Loading Train Images:')
train_data_map = ex.map(get_img_data_parallel, 
                        [record[0] for record in train_data_inp],
                        [record[1] for record in train_data_inp],
                        [record[2] for record in train_data_inp])
train_data = np.array(list(train_data_map))

print('\nLoading Validation Images:')
val_data_map = ex.map(get_img_data_parallel, 
                        [record[0] for record in val_data_inp],
                        [record[1] for record in val_data_inp],
                        [record[2] for record in val_data_inp])
val_data = np.array(list(val_data_map))

print('\nLoading Test Images:')
test_data_map = ex.map(get_img_data_parallel, 
                        [record[0] for record in test_data_inp],
                        [record[1] for record in test_data_inp],
                        [record[2] for record in test_data_inp])
test_data = np.array(list(test_data_map))

train_data.shape, val_data.shape, test_data.shape 

Loading Train Images:ThreadPoolExecutor-1_0: working on img num: 0

ThreadPoolExecutor-1_3: working on img num: 5000
ThreadPoolExecutor-1_2: working on img num: 10000
ThreadPoolExecutor-1_2: working on img num: 15000
ThreadPoolExecutor-1_4: working on img num: 17359

Loading Validation Images:
ThreadPoolExecutor-1_9: working on img num: 0
ThreadPoolExecutor-1_6: working on img num: 1928

Loading Test Images:
ThreadPoolExecutor-1_5: working on img num: 0
ThreadPoolExecutor-1_3: working on img num: 5000
ThreadPoolExecutor-1_4: working on img num: 8267


((17360, 125, 125, 3), (1929, 125, 125, 3), (8268, 125, 125, 3))

In [0]:
BATCH_SIZE = 64
NUM_CLASSES = 2
EPOCHS = 25
INPUT_SHAPE = (125, 125, 3)

train_imgs_scaled = train_data / 255.
val_imgs_scaled = val_data / 255.

# encode text category labels
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(train_labels)
train_labels_enc = le.transform(train_labels)
val_labels_enc = le.transform(val_labels)

print(train_labels[:6], train_labels_enc[:6])

['malaria' 'healthy' 'malaria' 'malaria' 'healthy' 'healthy'] [1 0 1 1 0 0]


In [0]:
# # vgg = tf.keras.applications.vgg19.VGG19(include_top=False, weights='imagenet', 
# #                                         input_shape=INPUT_SHAPE)
# resnet=tf.keras.applications.resnet.ResNet50(include_top=False, weights='imagenet', input_shape=INPUT_SHAPE)

# resnet.trainable = False
# # Freeze the layers
# for layer in resnet.layers:
#     layer.trainable = False
    
# base_resnet = resnet
# base_out = base_resnet.output
# pool_out = tf.keras.layers.Flatten()(base_out)
# hidden1 = tf.keras.layers.Dense(512, activation='relu')(pool_out)
# drop1 = tf.keras.layers.Dropout(rate=0.3)(hidden1)
# hidden2 = tf.keras.layers.Dense(512, activation='relu')(drop1)
# drop2 = tf.keras.layers.Dropout(rate=0.3)(hidden2)

# # Adding a Global Average Pooling layer
# x = GlobalAveragePooling2D()(x)

# # Adding a fully connected layer having 1024 neurons
# x = Dense(1024, activation='relu')(x)

# # Adding a fully connected layer having 2 neurons which will
# # give the probability of image having either dog or cat
# predictions = Dense(2, activation='softmax')(x)

# out = tf.keras.layers.Dense(1, activation='sigmoid')(drop2)

# model = tf.keras.Model(inputs=base_vgg.input, outputs=out)
# model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=1e-4),
#                 loss='binary_crossentropy',
#                 metrics=['accuracy'])
# model.summary()

from tensorflow.python.keras.applications import ResNet50
from tensorflow.python.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.optimizers import SGD
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator

# Download the architecture of ResNet50 with ImageNet weights
base_model = ResNet50(include_top=False, weights='imagenet')

# Taking the output of the last convolution block in ResNet50
x = base_model.output

# Adding a Global Average Pooling layer
x = GlobalAveragePooling2D()(x)

# Adding a fully connected layer having 1024 neurons
x = Dense(1024, activation='relu')(x)

# Adding a fully connected layer having 2 neurons which will
# give the probability of image having either dog or cat
predictions = Dense(1, activation='sigmoid')(x)

# Model to be trained
model = Model(inputs=base_model.input, outputs=predictions)
# model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=1e-4),
#                 loss='binary_crossentropy',
#                 metrics=['accuracy'])
model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss='binary_crossentropy', metrics = ['accuracy'])
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Downloading data from https://github.com/keras-team/keras-applications/releases/download/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, None,  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, None, None, 3 0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, None, None, 6 9472        co

In [0]:
import tensorflow as tf
import datetime

logdir = os.path.join('/home/dipanzan_sarkar/projects/tensorboard_logs', 
                      datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                              patience=2, min_lr=0.000001)

#early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, 
#                                              mode='auto', baseline=None, restore_best_weights=False)
callbacks = [reduce_lr, tensorboard_callback]

history = model.fit(x=train_imgs_scaled, y=train_labels_enc, 
                    batch_size=BATCH_SIZE,
                    epochs=5, 
                    validation_data=(val_imgs_scaled, val_labels_enc), 
                    callbacks=callbacks,
                    verbose=1)

Train on 17360 samples, validate on 1929 samples
Epoch 1/5

In [0]:
 pip install -U git+https://github.com/keras-team/keras git+https://github.com/keras-team/keras-applications


In [0]:
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
t = f.suptitle('Basic CNN Performance', fontsize=12)
f.subplots_adjust(top=0.85, wspace=0.3)

max_epoch = len(history.history['accuracy'])+1
epoch_list = list(range(1,max_epoch))
ax1.plot(epoch_list, history.history['accuracy'], label='Train Accuracy')
ax1.plot(epoch_list, history.history['val_accuracy'], label='Validation Accuracy')
ax1.set_xticks(np.arange(1, max_epoch, 5))
ax1.set_ylabel('Accuracy Value')
ax1.set_xlabel('Epoch')
ax1.set_title('Accuracy')
l1 = ax1.legend(loc="best")

ax2.plot(epoch_list, history.history['loss'], label='Train Loss')
ax2.plot(epoch_list, history.history['val_loss'], label='Validation Loss')
ax2.set_xticks(np.arange(1, max_epoch, 5))
ax2.set_ylabel('Loss Value')
ax2.set_xlabel('Epoch')
ax2.set_title('Loss')
l2 = ax2.legend(loc="best")
model.save('basic_cnn.h5')

Collecting resnet
  Downloading https://files.pythonhosted.org/packages/c2/5b/1a89d31126c50cea7b29db3772a00862fa72b54f0970032766c914091ee0/resnet-0.1.tar.gz
Building wheels for collected packages: resnet
  Building wheel for resnet (setup.py) ... [?25l[?25hdone
  Created wheel for resnet: filename=resnet-0.1-cp36-none-any.whl size=10044 sha256=6e2ef69bd93e087d72ebadb31197ffb8007d90580a6b16d010e24b98cd93f72a
  Stored in directory: /root/.cache/pip/wheels/f6/15/ce/e3fe2b3a504698765a1b2dfb05ed4a91fcc0f38c8e57568658
Successfully built resnet
Installing collected packages: resnet
Successfully installed resnet-0.1
