In [1]:
#!pip install fastai --upgrade

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so the dataset kept under "My Drive"
# can be read through the file system (needs the google.colab runtime).
drive.mount('/content/gdrive')

In [None]:
from fastai.vision.all import *
from glob import glob

In [2]:
def get_subject_ids(filename_list):
    """Extract the patient identifier from each image path.

    For every path, the third-from-last '/'-separated component is taken
    (the ``patientNNNNN`` directory in
    ``.../patient00008/study1_positive/image1.png``) and the text following
    the last occurrence of ``'patient'`` in it is kept, e.g. ``'00008'``.

    Returns a list with one string per input path, in input order.
    """
    subject_ids = []
    for path in filename_list:
        patient_dir = path.split('/')[-3]
        # rpartition yields the whole component when 'patient' is absent.
        subject_ids.append(patient_dir.rpartition('patient')[2])
    return subject_ids

def get_study_labels(filename_list):
    """Extract the study label from each image path.

    The label is the text after the last underscore of the second-from-last
    '/'-separated component, e.g. ``'positive'`` for
    ``.../patient00008/study1_positive/image1.png``. A component without an
    underscore is returned unchanged.

    Returns a list with one string per input path, in input order.
    """
    labels = []
    for path in filename_list:
        study_dir = path.split('/')[-2]
        labels.append(study_dir.rsplit('_', 1)[-1])
    return labels

In [3]:
def get_study_id(filename_list):
    """Extract the study name from each image path.

    The study name is the text before the first underscore of the
    second-from-last '/'-separated component, e.g. ``'study1'`` for
    ``.../patient00008/study1_positive/image1.png``. A component without an
    underscore is returned unchanged.

    Returns a list with one string per input path, in input order.
    """
    study_ids = []
    for path in filename_list:
        study_dir = path.split('/')[-2]
        study_ids.append(study_dir.partition('_')[0])
    return study_ids

In [4]:
# Local (non-Colab) layout, kept for reference:
#NB_DIR = Path.cwd()
#STUDY_DIRECTORY = NB_DIR/'..'/'data'/'MURA_sample'

# Colab layout: Drive is mounted at /content/gdrive, so anchor the dataset
# path at that mount point. An absolute path keeps the globs below working
# regardless of the notebook's current working directory.
STUDY_DIRECTORY = Path('/content/gdrive/My Drive/MURA_sample/')
# Name of the sub-directory under train/ and valid/ whose images are indexed.
category = 'XR_HAND'

## Get training data

In [5]:
# Collect every PNG under <STUDY_DIRECTORY>/train/<category>/patient*/study*/
# and sort the paths so the ordering is reproducible between runs.
train_data = glob(str(STUDY_DIRECTORY.joinpath('train', category, 'patient*', 'study*', '*.png')))
train_data.sort()
# Number of training images found.
len(train_data)

5543

In [6]:
# Show the first path of the sorted list to check the
# .../patient*/study*/image*.png layout that the parsing helpers index into.
train_data[0]

'/home/sathiesh/machine_learning/rad230_dl/../data/MURA_sample/train/XR_HAND/patient00008/study1_positive/image1.png'

In [7]:
# Parse subject id, label and study name out of every training path; the three
# lists are aligned element-for-element with train_data.
train_subject_ids, train_labels, train_study_ids = (
    get_subject_ids(train_data),
    get_study_labels(train_data),
    get_study_id(train_data),
)
# Preview the parsed fields of the first training image.
train_subject_ids[0], train_labels[0], train_study_ids[0]

('00008', 'positive', 'study1')

## Get validation data

In [8]:
# Collect every PNG under <STUDY_DIRECTORY>/valid/<category>/patient*/study*/
# and sort the paths so the ordering is reproducible between runs.
valid_data = glob(str(STUDY_DIRECTORY.joinpath('valid', category, 'patient*', 'study*', '*.png')))
valid_data.sort()
# Number of validation images found.
len(valid_data)

460

In [9]:
# Show the first path of the sorted list to check the
# .../patient*/study*/image*.png layout that the parsing helpers index into.
valid_data[0]

'/home/sathiesh/machine_learning/rad230_dl/../data/MURA_sample/valid/XR_HAND/patient11190/study1_negative/image1.png'

In [10]:
# Parse subject id, label and study name out of every validation path; the
# three lists are aligned element-for-element with valid_data.
valid_subject_ids, valid_labels, valid_study_ids = (
    get_subject_ids(valid_data),
    get_study_labels(valid_data),
    get_study_id(valid_data),
)
# Preview the parsed fields of the first validation image.
valid_subject_ids[0], valid_labels[0], valid_study_ids[0]

('11190', 'negative', 'study1')

## Create dataframe

In [11]:
# Column names for the per-image table, in the same order as the
# path / subject / study / label lists are zipped together when the
# train and validation frames are built.
columns = [
    'img_path',
    'subject_id',
    'study_id',
    'label',
]

In [12]:
# One row per training image: path plus its parsed subject, study and label.
# is_valid=False marks every row as belonging to the training split.
df_train = pd.DataFrame(
    list(zip(train_data, train_subject_ids, train_study_ids, train_labels)),
    columns=columns,
).assign(is_valid=False)

In [13]:
# One row per validation image: path plus its parsed subject, study and label.
# is_valid=True marks every row as belonging to the validation split.
df_val = pd.DataFrame(
    list(zip(valid_data, valid_subject_ids, valid_study_ids, valid_labels)),
    columns=columns,
).assign(is_valid=True)

In [14]:
#concatenate training and validation dataframes
frames = [df_train, df_val]
df = pd.concat(frames)
df['study_id'] = df.subject_id + '_' + df.study_id

In [15]:
# Preview the first rows of the combined train + validation table.
df.head()

Unnamed: 0,img_path,subject_id,study_id,label,is_valid
0,/home/sathiesh/machine_learning/rad230_dl/../data/MURA_sample/train/XR_HAND/patient00008/study1_positive/image1.png,8,00008_study1,positive,False
1,/home/sathiesh/machine_learning/rad230_dl/../data/MURA_sample/train/XR_HAND/patient00008/study1_positive/image2.png,8,00008_study1,positive,False
2,/home/sathiesh/machine_learning/rad230_dl/../data/MURA_sample/train/XR_HAND/patient00008/study1_positive/image3.png,8,00008_study1,positive,False
3,/home/sathiesh/machine_learning/rad230_dl/../data/MURA_sample/train/XR_HAND/patient00050/study1_negative/image1.png,50,00050_study1,negative,False
4,/home/sathiesh/machine_learning/rad230_dl/../data/MURA_sample/train/XR_HAND/patient00050/study1_negative/image2.png,50,00050_study1,negative,False


In [16]:
# Write the combined table to data.csv (path is relative to the current
# working directory); index=False leaves the row index out of the file.
# NOTE(review): subject_id holds zero-padded strings such as '00008'; code
# that reads this CSV back presumably needs dtype=str for that column to keep
# the leading zeros — confirm against the downstream loader.
df.to_csv('data.csv', index=False)