In [1]:
import beepy

In [1]:
import tensorflow as tf 
import os 
import PIL
import pandas as pd 
import numpy as np 
import pathlib
import matplotlib.pyplot as plt 
import cv2
import warnings
import config
import re

In [2]:
# Root folder of the CheXpert data, taken from the local `config` module.
data_dir = pathlib.Path(config.CHEXPERT_DATA_PATH)
# Show what the folder contains (label CSVs, image folders, path lists).
os.listdir(data_dir)

['chexpert_train_image_paths.txt',
 'chexpert_valid_image_paths.txt',
 'train',
 'train.csv',
 'valid',
 'valid.csv']

In [3]:
# Load the label tables that sit next to the image folders.
train_df = pd.read_csv(data_dir / 'train.csv')
valid_df = pd.read_csv(data_dir / 'valid.csv')

In [4]:
# Preview the first rows of the training label table.
train_df.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient00001/study1/...,Female,68,Frontal,AP,1.0,,,,,,,,,0.0,,,,1.0
1,CheXpert-v1.0-small/train/patient00002/study2/...,Female,87,Frontal,AP,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,
2,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Frontal,AP,,,,1.0,,,-1.0,,,,,,1.0,
3,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Lateral,,,,,1.0,,,-1.0,,,,,,1.0,
4,CheXpert-v1.0-small/train/patient00003/study1/...,Male,41,Frontal,AP,,,,,,1.0,,,,0.0,,,,


In [5]:
# Column dtypes and non-null counts for the training table (shows how sparse the labels are).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223414 entries, 0 to 223413
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Path                        223414 non-null  object 
 1   Sex                         223414 non-null  object 
 2   Age                         223414 non-null  int64  
 3   Frontal/Lateral             223414 non-null  object 
 4   AP/PA                       191027 non-null  object 
 5   No Finding                  22381 non-null   float64
 6   Enlarged Cardiomediastinum  44839 non-null   float64
 7   Cardiomegaly                46203 non-null   float64
 8   Lung Opacity                117778 non-null  float64
 9   Lung Lesion                 11944 non-null   float64
 10  Edema                       85956 non-null   float64
 11  Consolidation               70622 non-null   float64
 12  Pneumonia                   27608 non-null   float64
 13  Atelectasis   

In [6]:
# Same summary for the validation table.
valid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Path                        234 non-null    object 
 1   Sex                         234 non-null    object 
 2   Age                         234 non-null    int64  
 3   Frontal/Lateral             234 non-null    object 
 4   AP/PA                       202 non-null    object 
 5   No Finding                  234 non-null    float64
 6   Enlarged Cardiomediastinum  234 non-null    float64
 7   Cardiomegaly                234 non-null    float64
 8   Lung Opacity                234 non-null    float64
 9   Lung Lesion                 234 non-null    float64
 10  Edema                       234 non-null    float64
 11  Consolidation               234 non-null    float64
 12  Pneumonia                   234 non-null    float64
 13  Atelectasis                 234 non

In [7]:
# First five entries of column 0 (the image path column).
train_df.iloc[:5,0]

0    CheXpert-v1.0-small/train/patient00001/study1/...
1    CheXpert-v1.0-small/train/patient00002/study2/...
2    CheXpert-v1.0-small/train/patient00002/study1/...
3    CheXpert-v1.0-small/train/patient00002/study1/...
4    CheXpert-v1.0-small/train/patient00003/study1/...
Name: Path, dtype: object

#### Replacing NaN values for the 14 labels

In [7]:
# Every column from position 5 onwards is one of the 14 finding labels;
# these are the columns whose NaN values get replaced with 0.0.
label_columns = train_df.columns[5:].tolist()
label_columns

['No Finding',
 'Enlarged Cardiomediastinum',
 'Cardiomegaly',
 'Lung Opacity',
 'Lung Lesion',
 'Edema',
 'Consolidation',
 'Pneumonia',
 'Atelectasis',
 'Pneumothorax',
 'Pleural Effusion',
 'Pleural Other',
 'Fracture',
 'Support Devices']

In [8]:
# Treat an unmentioned finding (NaN) as negative (0.0) in both tables.
train_df.loc[:, label_columns] = train_df.loc[:, label_columns].fillna(0.0)
valid_df.loc[:, label_columns] = valid_df.loc[:, label_columns].fillna(0.0)

In [9]:
# Replacing uncertain labels with positive labels
def uncertain_to_pos(df, columns=None):
    """Map uncertain labels (-1.0) to positive labels (1.0), in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    columns : iterable of str, optional
        Columns to restrict the replacement to. When omitted, every column
        of ``df`` is scanned (the original behaviour), which also rewrites
        any -1 found in non-label columns.
    """
    if columns is None:
        df.replace(to_replace=-1.0, value=1.0, inplace=True)
        return
    selected = list(columns)
    # Assign back through the frame so the caller's object is updated.
    df[selected] = df[selected].replace(to_replace=-1.0, value=1.0)

# Apply the uncertain -> positive policy to both label tables (in place).
for frame in (train_df, valid_df):
    uncertain_to_pos(frame)

In [10]:
# Index both tables by image path so labels can be looked up by path later.
train_df = train_df.set_index('Path')
valid_df = valid_df.set_index('Path')


### Creating a data pipeline

In [11]:
# Only the first line of each file is read; it is parsed into a Python
# object in the next cell (presumably a list literal of image paths).
with open('chexpert_train_image_paths.txt', 'r') as train_file:
    train_image_paths = train_file.readline()

with open('chexpert_valid_image_paths.txt', 'r') as valid_file:
    valid_image_paths = valid_file.readline()

In [12]:
import ast

# Turn the text read from each file into the Python object it spells out.
# literal_eval only accepts literals, so no code from the file is executed.
train_image_paths, valid_image_paths = map(
    ast.literal_eval, (train_image_paths, valid_image_paths)
)

In [13]:
def modify_path(path, base_dir=None, skip_parts=4):
    """Re-root a '/'-separated image path under the local dataset folder.

    The first ``skip_parts`` components of ``path`` are dropped and the
    remainder is joined onto ``base_dir`` using the OS path separator.

    Parameters
    ----------
    path : str
        Image path using '/' as separator.
    base_dir : str or os.PathLike, optional
        New root for the path. Defaults to the notebook-level ``data_dir``.
    skip_parts : int, optional
        Number of leading components to discard (default 4, the prefix
        length the stored path lists are assumed to have).

    Returns
    -------
    str
        The re-rooted path.

    Raises
    ------
    ValueError
        If nothing is left of ``path`` after dropping the prefix.
    """
    if base_dir is None:
        base_dir = data_dir
    parts = path.split('/')[skip_parts:]
    if not parts:
        # Without this check the join would silently return the dataset root.
        raise ValueError(
            f"path {path!r} has no components left after dropping {skip_parts}"
        )
    return os.path.join(base_dir, *parts)

In [14]:
from joblib import Parallel, delayed

In [15]:
# Re-root every training path. The work per item is a tiny pure-Python string
# operation, so a plain comprehension is used: a thread pool cannot speed up
# GIL-bound code and only adds per-task dispatch overhead.
train_image_paths = [modify_path(path) for path in train_image_paths]

In [16]:
# Re-root every validation path (plain comprehension; the per-item work is
# too small for a thread pool to help).
valid_image_paths = [modify_path(path) for path in valid_image_paths]

In [17]:
# Spot-check that the rewritten paths point into the local dataset folder.
train_image_paths[:5]

['D:\\Datasets\\CheXpert-v1.0-small\\train\\patient00734\\study3\\view2_lateral.jpg',
 'D:\\Datasets\\CheXpert-v1.0-small\\train\\patient00734\\study3\\view1_frontal.jpg',
 'D:\\Datasets\\CheXpert-v1.0-small\\train\\patient00734\\study2\\view1_frontal.jpg',
 'D:\\Datasets\\CheXpert-v1.0-small\\train\\patient00734\\study1\\view1_frontal.jpg',
 'D:\\Datasets\\CheXpert-v1.0-small\\train\\patient28598\\study3\\view1_frontal.jpg']

In [18]:
# Drop "._*" entries (presumably macOS sidecar files, not real images).
# Filtering by file name avoids a hard-coded absolute path and, unlike
# list.remove, does not raise when the cell is re-run or the file is absent.
train_image_paths = [
    path for path in train_image_paths
    if not os.path.basename(path).startswith('._')
]

In [19]:
import random

# Fixed seed so the shuffled order, and therefore the train/validation split
# taken from it below, is the same on every fresh run of the notebook.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(train_image_paths)

In [20]:
# Datasets of path strings; images are loaded lazily in a later map step.
train_ds = tf.data.Dataset.from_tensor_slices(train_image_paths)
# Note: the "test" dataset is built from the paths listed in the *valid* file.
test_ds = tf.data.Dataset.from_tensor_slices(valid_image_paths)

In [21]:
# Number of training paths, as a plain NumPy integer.
image_count = train_ds.cardinality().numpy()

In [22]:
# Fraction of the (already shuffled) training paths held out for validation.
VAL_FRACTION = 0.5
val_size = int(image_count * VAL_FRACTION)

# The first `val_size` paths go to validation, the rest to training.
val_ds = train_ds.take(val_size)
train_final_ds = train_ds.skip(val_size)

In [23]:
# One path-indexed label table covering both CSVs, used for label lookup.
main_df = pd.concat([train_df, valid_df])

In [24]:
input_height, input_width = 512, 512


def process_path(file_path):
    """Load one image and its label vector from an image file path.

    Must run eagerly (it calls ``.numpy()``), so it is wrapped in
    ``tf.py_function`` when mapped over a dataset.

    Parameters
    ----------
    file_path : tf.Tensor
        Scalar string tensor holding the path of a JPEG file.

    Returns
    -------
    (img, label)
        ``img`` is the image decoded as single-channel, resized to
        ``input_height`` x ``input_width``, expanded to 3 channels and passed
        through the MobileNet ``preprocess_input``. ``label`` is the
        ``label_columns`` row of ``main_df`` for this image, cast to int16.
    """
    # The last five path components, joined with '/', form the key used to
    # look the image up in the path-indexed label table.
    tail = tf.strings.split(file_path, os.path.sep).numpy()[-5:]
    min_path = '/'.join(part.decode() for part in tail)
    label = tf.cast(list(main_df.loc[min_path, label_columns]), dtype=tf.int16)

    # Read and decode the JPEG as a single-channel image.
    raw_bytes = tf.io.read_file(file_path)
    gray = tf.io.decode_jpeg(raw_bytes, channels=1)

    # Resize, replicate the channel to RGB, then apply the preprocessing.
    resized = tf.image.resize(gray, [input_height, input_width])
    rgb = tf.image.grayscale_to_rgb(resized)
    img = tf.keras.applications.mobilenet.preprocess_input(rgb)
    return img, label

In [25]:
def _load_example(file_path):
    """Run ``process_path`` eagerly for one path and restore static shapes.

    ``tf.py_function`` returns tensors whose shape is unknown to the graph,
    which Keras layers and metrics cannot work with. The shapes are known
    from ``process_path`` (resized 3-channel image, one value per label
    column), so they are set explicitly here.
    """
    img, label = tf.py_function(
        func=process_path, inp=[file_path], Tout=(tf.float32, tf.int16)
    )
    img.set_shape([input_height, input_width, 3])
    label.set_shape([len(label_columns)])
    return img, label


# Same loading step for all three splits.
train_final_ds = train_final_ds.map(_load_example, num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = val_ds.map(_load_example, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.map(_load_example, num_parallel_calls=tf.data.AUTOTUNE)

In [26]:
batch_size = 4

# Batch each split and let tf.data prefetch upcoming batches.
train, valid, test = (
    split.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    for split in (train_final_ds, valid_ds, test_ds)
)

### EfficientNetV2 - S

In [27]:
import sys
# `effnetv2_model` is imported from an `automl/efficientnetv2` folder under the
# current working directory, so that folder must be on sys.path first.
effnet_path = os.path.join(os.getcwd(), 'automl', 'efficientnetv2')
sys.path.append(effnet_path)
import effnetv2_model

In [28]:
# Cosine learning-rate decay with warm restarts, starting at 1e-4.
learning_rate_scheduler = tf.keras.optimizers.schedules.CosineDecayRestarts(
    initial_learning_rate=1e-4,
    first_decay_steps=1000,
    t_mul=2,
    m_mul=0.9,
    alpha=0.02,
)

In [29]:
# Start from a clean Keras state so re-running this cell does not pile up models.
tf.keras.backend.clear_session()

# EfficientNetV2-S backbone without its classification head and without
# pretrained weights, followed by dropout and a 14-unit sigmoid head.
model_layers = [
    tf.keras.layers.InputLayer(input_shape=(input_height, input_width, 3)),
    effnetv2_model.get_model('efficientnetv2-s', include_top=False, pretrained=False),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(units=14, activation='sigmoid'),
]
effnetv2_s = tf.keras.models.Sequential(model_layers)
effnetv2_s.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
efficientnetv2-s (EffNetV2Mo (None, 1280)              20331360  
_________________________________________________________________
dropout_1 (Dropout)          (None, 1280)              0         
_________________________________________________________________
dense (Dense)                (None, 14)                17934     
Total params: 20,349,294
Trainable params: 20,195,422
Non-trainable params: 153,872
_________________________________________________________________


In [30]:
# Adam driven by the cosine-restart schedule defined earlier.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_scheduler)

# The head is 14 independent sigmoid units and several labels can be positive
# for the same image, so the loss must be per-label binary cross-entropy.
# Categorical cross-entropy assumes the targets describe a single class and
# gives a meaningless training signal for this multi-label setup.
effnetv2_s.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.AUC()],
)

In [31]:
# Where the ModelCheckpoint callback writes the model (relative to the working directory).
checkpoint_path = './checkpoints/train/efficientNetV2-s/512px'

In [32]:
# Stop once the monitored quantity has gone `patience` epochs without
# improving by at least `min_delta`.
early_stopping = tf.keras.callbacks.EarlyStopping(
    min_delta=0.0001,
    patience=2,
)

# Keep only the best model seen so far at `checkpoint_path`.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    save_best_only=True,
)
# NOTE(review): this sets a private Keras attribute; presumably a workaround
# for a checkpoint-saving issue. Confirm it is still needed.
checkpoint._supports_tf_logs = False

callbacks = [early_stopping, checkpoint]


In [1]:
# Validate on `valid` (the split held out from the training images), not on
# `test`: early stopping and best-checkpoint selection both act on the
# validation data, so using the test set there would leak it into model
# selection. The History object is kept for later inspection.
history = effnetv2_s.fit(train, validation_data=valid, epochs=10, callbacks=callbacks, verbose=1)

NameError: name 'effnetv2_s' is not defined

In [None]:
#effnetv2_s.save(r'C:\Users\prans\Python files\Kaggle Competitions\Covid_19_object_detection\saved_models\efficientNetV2-s')