# Load all CheXpert images together with their labels; the dataset's CSV files list each image path alongside its labels
## Link to CheXpert dataset: https://stanfordmlgroup.github.io/competitions/chexpert/

In [1]:
# Target edge length in pixels: the loading cells resize every image to imdim x imdim.
# NOTE(review): the recorded cell outputs further down show (N, 224, 224, 1) shapes, so
# they were presumably produced with imdim = 224 rather than the 128 set here -- confirm
# and re-run before relying on those outputs.
imdim = 128

In [2]:
# Loading Validation Images and Labels
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
from PIL import Image

# Read the validation index and keep only the frontal-view rows.
vlabel = pd.read_csv('/Users/santoshdaptardar/Documents/CheXpert-v1.0-small/valid.csv')
vlabel = vlabel[vlabel['Frontal/Lateral'].isin(['Frontal'])]
docu_path = '/Users/santoshdaptardar/Documents'

# Column 0 holds the image path relative to docu_path
# (presumably the 'Path' column -- verify against the CSV header).
paths = vlabel.iloc[:, 0].tolist()

# Fill a preallocated (n_images, imdim, imdim, 1) float array in place.
# Images are opened by full path instead of os.chdir-ing into every study
# folder, so the process working directory is left untouched, and the
# context manager closes each file handle as soon as its pixels are copied.
vdata = np.empty((len(paths), imdim, imdim, 1))
for i, p in enumerate(paths):
    with Image.open(os.path.join(docu_path, p)) as img:
        # convert('L') guarantees a single-channel image, so an RGB/RGBA file
        # cannot break the (imdim, imdim) assignment; pixels are scaled to [0, 1].
        vdata[i, :, :, 0] = np.asarray(img.convert('L').resize((imdim, imdim))) / 255.0

# Columns 5..18 are the 14 label columns. They are sliced in one vectorised
# step and cast to float so the array has a numeric dtype, matching what the
# training cell produces (row-by-row extraction yielded an object array).
vlabels = vlabel.iloc[:, 5:19].astype(float).to_numpy()
print(vdata.shape)
print(vlabels[100:105,])

(202, 224, 224, 1)
[[0.0 1.0 1.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0]
 [0.0 1.0 1.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0]
 [0.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0]
 [0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 1.0]]


In [7]:
# Loading Training images and labels

import os, pandas as pd, numpy as np, matplotlib.pyplot as plt, pickle
from PIL import Image
from tqdm.notebook import tqdm

# Read the training index, keep only the frontal-view rows, and treat
# missing label entries as 0.
vlabel = pd.read_csv('/Users/santoshdaptardar/Documents/CheXpert-v1.0-small/train.csv')
vlabel = vlabel[vlabel['Frontal/Lateral'].isin(['Frontal'])]
vlabel = vlabel.fillna(0)
docu_path = '/Users/santoshdaptardar/Documents'

# Number of images to load. Clamped to the number of available rows so the
# progress bar and the array sizes stay correct on a smaller CSV; replace
# 50000 with len(vlabel) to load everything.
total = min(50000, len(vlabel))

# Column 0 holds the image path relative to docu_path
# (presumably the 'Path' column -- verify against the CSV header).
paths = vlabel.iloc[:total, 0].tolist()

# Preallocate the (total, imdim, imdim, 1) float array and fill it in place.
# Collecting per-image arrays in a list and converting afterwards would hold
# two copies of the whole image set in memory at once.
data = np.empty((total, imdim, imdim, 1))
for i, p in enumerate(tqdm(paths)):
    # Open by full path (no os.chdir, so the working directory is unchanged)
    # and close the file handle as soon as the pixels are copied.
    with Image.open(os.path.join(docu_path, p)) as img:
        # convert('L') guarantees a single-channel image, so an RGB/RGBA file
        # cannot break the (imdim, imdim) assignment; pixels are scaled to [0, 1].
        data[i, :, :, 0] = np.asarray(img.convert('L').resize((imdim, imdim))) / 255.0

# Columns 5..18 are the 14 label columns; slice them for the loaded rows in
# one vectorised step instead of one iloc lookup per image.
labels = vlabel.iloc[:total, 5:19].astype(float).to_numpy()
labels = np.where(labels == -1, 0, labels)  # replace -1 with 0
print(labels[:10,])
print(data.shape)

HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))


[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1.]]
(50000, 224, 224, 1)


In [1]:
# Pickle the train and test (valid.csv split) data and labels into bz2-compressed files for easy loading
import bz2

# All output files are written into the project directory.
os.chdir('/Users/santoshdaptardar/Documents/Deep Learning/Project')

# The training images are spread over seven bz2-compressed pickles of up to
# 30000 rows each; the seventh file takes every row from 180000 onward.
# Note that the '_128' in the file names is fixed text and does not follow imdim.
chunk_rows = 30000
for part in range(1, 8):
    first = (part - 1) * chunk_rows
    last = part * chunk_rows if part < 7 else None  # open-ended final slice
    with bz2.BZ2File('train_data_%d_128.pbz2' % part, 'w') as f:
        pickle.dump(data[first:last, :], f)

# The remaining arrays each go into a single file: training labels first,
# then the validation-split images and labels under the 'test_' names.
for fname, payload in (
    ('train_label_128.pbz2', labels),
    ('test_data_128.pbz2', vdata),
    ('test_label_128.pbz2', vlabels),
):
    with bz2.BZ2File(fname, 'w') as f:
        pickle.dump(payload, f)