In [2]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import pydicom as dicom
from skimage.exposure import rescale_intensity
import cv2
from os import path

In [3]:
from PIL import Image, ImageOps


def padding(img, expected_size):
    """Centre ``img`` on a square canvas by adding a border around it.

    Args:
        img: image object exposing ``size`` as ``(width, height)``.
        expected_size: side length, used for both width and height.

    Returns:
        Whatever ``ImageOps.expand`` returns for ``img`` and the computed
        four-value border.

    Note:
        No check is made that ``img`` fits; an image larger than
        ``expected_size`` produces negative border values.
    """
    extra_w = expected_size - img.size[0]
    extra_h = expected_size - img.size[1]
    left = extra_w // 2
    top = extra_h // 2
    # When the missing amount is odd, the right / bottom side gets the extra pixel.
    border = (left, top, extra_w - left, extra_h - top)
    return ImageOps.expand(img, border)


def resize_with_padding(img, expected_size):
    """Shrink an image to fit ``expected_size`` and pad it to exactly that size.

    Args:
        img: PIL image. It is not modified; the work happens on a copy.
        expected_size: ``(width, height)`` of the output canvas.

    Returns:
        The result of ``ImageOps.expand`` on the shrunken copy, with the border
        split as evenly as possible between opposite sides.
    """
    target_w, target_h = expected_size[0], expected_size[1]
    # Image.thumbnail() resizes in place, so operate on a copy to keep the
    # caller's image at its original size.
    resized = img.copy()
    resized.thumbnail((target_w, target_h))
    delta_width = target_w - resized.size[0]
    delta_height = target_h - resized.size[1]
    pad_width = delta_width // 2
    pad_height = delta_height // 2
    # When the missing amount is odd, the right / bottom side gets the extra pixel.
    border = (pad_width, pad_height, delta_width - pad_width, delta_height - pad_height)
    return ImageOps.expand(resized, border)

def fix_benign(x):
    """Map the label ``'BENIGN_WITHOUT_CALLBACK'`` to ``'BENIGN'``; return any other value unchanged."""
    return 'BENIGN' if x == 'BENIGN_WITHOUT_CALLBACK' else x

## CHINESE DATA PROCESSING

In [4]:
# Directory that holds both CMMD input tables read in the following cells.
cmmd_manifest_dir = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/Chinese_mamography/manifest-1616439774456'

# Clinical spreadsheet (loaded with pd.read_excel).
indicator_location = cmmd_manifest_dir + '/CMMD_clinicaldata_revision.xlsx'

# Download manifest with one folder location per subject (loaded with pd.read_csv).
metadata_location = cmmd_manifest_dir + '/metadata.csv'

In [5]:
# Keep only the subject identifier and the folder location, renamed so the
# identifier column is called 'ID1' like in the clinical spreadsheet.
df_meta = (
    pd.read_csv(metadata_location)[['Subject ID', 'File Location']]
    .rename(columns={'Subject ID': 'ID1', 'File Location': 'Location'})
)
print(df_meta.head())
print(len(df_meta))

       ID1                                           Location
0  D1-0002  ./CMMD/D1-0002/07-18-2010-NA-NA-49231/1.000000...
1  D1-0001  ./CMMD/D1-0001/07-18-2010-NA-NA-79377/1.000000...
2  D1-0004  ./CMMD/D1-0004/07-18-2011-NA-NA-14914/1.000000...
3  D1-0003  ./CMMD/D1-0003/07-18-2011-NA-NA-25491/1.000000...
4  D1-0006  ./CMMD/D1-0006/07-18-2010-NA-NA-16802/1.000000...
1775


In [6]:
# Clinical table: the preview below shows one row per (subject, breast side) with its label.
df_indicator = pd.read_excel(indicator_location)
print(df_indicator.head(), len(df_indicator), sep='\n')

       ID1 LeftRight  Age  number    abnormality classification subtype
0  D1-0001         R   44       2  calcification         Benign     NaN
1  D1-0002         L   40       2  calcification         Benign     NaN
2  D1-0003         L   39       2  calcification         Benign     NaN
3  D1-0004         L   41       2  calcification         Benign     NaN
4  D1-0005         R   42       2  calcification         Benign     NaN
1872


In [7]:
# Attach the folder location to every clinical row (inner join on the subject id),
# then rename the key explicitly instead of relabelling "whatever column is first".
df_full = (
    df_indicator
    .merge(df_meta, on='ID1')
    .rename(columns={'ID1': 'ID'})
)
df_full.head()

Unnamed: 0,ID,LeftRight,Age,number,abnormality,classification,subtype,Location
0,D1-0001,R,44,2,calcification,Benign,,./CMMD/D1-0001/07-18-2010-NA-NA-79377/1.000000...
1,D1-0002,L,40,2,calcification,Benign,,./CMMD/D1-0002/07-18-2010-NA-NA-49231/1.000000...
2,D1-0003,L,39,2,calcification,Benign,,./CMMD/D1-0003/07-18-2011-NA-NA-25491/1.000000...
3,D1-0004,L,41,2,calcification,Benign,,./CMMD/D1-0004/07-18-2011-NA-NA-14914/1.000000...
4,D1-0005,R,42,2,calcification,Benign,,./CMMD/D1-0005/07-18-2011-NA-NA-52347/1.000000...


In [9]:
##### GETTING DATAFRAME WITH RELEVANT DATA INFORMATION AND LOCATION
# Expand the merged table into one record per image file:
# every clinical row of a subject yields two files, labelled 'CC' then 'MLO'.
#
# NOTE(review): this assumes files 1-1.dcm / 1-2.dcm belong to the subject's first
# clinical row and 1-3.dcm / 1-4.dcm to the second, and that odd files are CC and
# even files are MLO. Nothing here checks that against the DICOM headers - confirm.

im_counter = 0
metadata_container = {}

screening_type = ['CC', 'MLO']

for idx in df_full['ID'].unique():
    relevant_cut = df_full[df_full['ID'] == idx]
    if len(relevant_cut) not in (1, 2):
        raise RuntimeError(
            f'Something is off in the loading of the data for the index: {idx} '
            f'({len(relevant_cut)} rows found, expected 1 or 2)'
        )
    images_to_extract = 2 * len(relevant_cut)

    # The stored location starts with '.', which is dropped so the path can be
    # appended to an absolute root later.
    image_folder_path = relevant_cut['Location'].unique()[0][1:]
    LR_array = relevant_cut['LeftRight'].to_numpy()
    abnormality = relevant_cut['abnormality'].to_numpy()
    outcomes = relevant_cut['classification'].to_numpy()

    for im_idx in range(1, images_to_extract + 1):
        im_location = image_folder_path + '/1-' + str(im_idx) + '.dcm'  # im final location
        row = (im_idx - 1) // 2  # files 1-2 -> first clinical row, files 3-4 -> second
        sc_type = screening_type[(im_idx - 1) % 2]
        metadata_container[im_counter] = [
            idx, LR_array[row], sc_type, abnormality[row], outcomes[row], im_location,
        ]
        im_counter += 1

df_data_details = pd.DataFrame.from_dict(
    metadata_container,
    columns=['ID', 'LR', 'Method', 'Abnormality', 'Outcome', 'Original_Location'],
    orient='index',
)
df_data_details.head()

Unnamed: 0,ID,LR,Method,Abnormality,Outcome,Original_Location
0,D1-0001,R,CC,calcification,Benign,/CMMD/D1-0001/07-18-2010-NA-NA-79377/1.000000-...
1,D1-0001,R,MLO,calcification,Benign,/CMMD/D1-0001/07-18-2010-NA-NA-79377/1.000000-...
2,D1-0002,L,CC,calcification,Benign,/CMMD/D1-0002/07-18-2010-NA-NA-49231/1.000000-...
3,D1-0002,L,MLO,calcification,Benign,/CMMD/D1-0002/07-18-2010-NA-NA-49231/1.000000-...
4,D1-0003,L,CC,calcification,Benign,/CMMD/D1-0003/07-18-2011-NA-NA-25491/1.000000-...


In [10]:
# prepare local paths
project_data_dir = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/'
cmmd_dir = project_data_dir + 'Chinese_mamography/'

# Prefix for the 'Original_Location' values (no trailing slash: those values start with '/').
root_path = cmmd_dir + 'manifest-1616439774456'

# Destination for the full-size exports and for the metadata CSV.
save_path = cmmd_dir + 'extracted_dataset/'

# Destination for the resized images.
merged_save_path = project_data_dir + 'final_dataset/all/'

In [13]:
from PIL import Image

# Convert every DICOM file listed in df_data_details to JPEG.
save_full = False      # also write the unresized image under save_path + folder
save_resized = True    # write the 224x224 padded image under merged_save_path

names_container = []          # file names of the images that were written
new_location_container = []   # matching relative locations (folder + name)
failed_ids = []               # index labels of the rows that could not be converted
folder = 'unprocessed/' # which folder to put the images into

for idx in range(len(df_data_details)):
    name = 'breast_scan_china_' + str(idx) + '.jpeg' # determine the new name of the file

    im_metadata = df_data_details.iloc[idx]
    image_path = root_path + im_metadata['Original_Location']

    try:
        ds = dicom.dcmread(image_path) # load the dicom file
        image = ds.pixel_array # extract the image from it
        im = Image.fromarray(image) # convert to PIL and save
        if save_full:
            im.save(save_path + folder + name, quality=100)
        if save_resized:
            img = resize_with_padding(im, (224, 224))
            img.save(merged_save_path + name, quality=100)
    except Exception as exc:
        # Best effort: report the row and the reason, then continue with the next
        # file. `Exception` (not a bare except) lets KeyboardInterrupt through.
        print('Failed for:', im_metadata)
        print('Reason:', repr(exc))
        # Store the index label, because the rows are dropped by label afterwards.
        failed_ids.append(df_data_details.index[idx])
    else:
        names_container.append(name)
        new_location_container.append(folder + name)

Failed for: ID                                                             D1-1343
LR                                                                   L
Method                                                              CC
Abnormality                                                       mass
Outcome                                                      Malignant
Original_Location    /CMMD/D1-1343/07-18-2010-NA-NA-65588/1.000000-...
Name: 1836, dtype: object
Failed for: ID                                                             D1-1343
LR                                                                   L
Method                                                             MLO
Abnormality                                                       mass
Outcome                                                      Malignant
Original_Location    /CMMD/D1-1343/07-18-2010-NA-NA-65588/1.000000-...
Name: 1837, dtype: object


In [11]:
# Drop the rows whose image could not be converted.
# errors='ignore' keeps this cell re-runnable: labels already removed by an
# earlier run are skipped instead of raising KeyError.
df_data_details = df_data_details.drop(labels=failed_ids, axis=0, errors='ignore')

NameError: name 'failed_ids' is not defined

In [12]:
# Attach the exported file name and its relative location to each remaining row.
# Both lists must have exactly one entry per row of df_data_details.
df_data_details = df_data_details.assign(
    file_name=names_container,
    extracted_location=new_location_container,
)

NameError: name 'names_container' is not defined

In [63]:
# save the metadata next to the extracted images (save_path already ends with '/')
df_data_details.to_csv(f'{save_path}metadata.csv')

## KAGGLE DATA PROCESSING

In [44]:
# KAGGLE DATA PROCESSING ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------
import regex as re

def rpl(im):
    """Return the directory part of ``im`` without its first two path components.

    The directory part is everything up to the last ``'/'``. Its first two
    ``'/'``-separated components (plus any slashes directly after them) are
    removed, as is the trailing slash.

    Example:
        ``rpl('a/b/c/file.dcm')`` returns ``'c'``.

    Args:
        im: ``'/'``-separated path string containing at least one ``'/'``.

    Returns:
        The remaining directory string; ``''`` when nothing is left.

    Raises:
        TypeError: if ``im`` contains no ``'/'`` (the regex match is ``None``).
    """
    directory = re.match(r'.*/', im)[0]
    leading = re.match(r'[^/]*/[^/]*/*', directory)[0]
    # Slice the prefix off instead of str.replace(), which would also delete any
    # later repetition of the same two components.
    return directory[len(leading):-1]

In [45]:
# All CBIS-DDSM description tables are read from the same csv directory.
cbis_csv_dir = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/CBIS-DDSM/csv/'

# Case descriptions, split by abnormality type (calc / mass) and by train / test set.
df_calc_train = pd.read_csv(cbis_csv_dir + 'calc_case_description_train_set.csv')
df_calc_test = pd.read_csv(cbis_csv_dir + 'calc_case_description_test_set.csv')
df_mass_train = pd.read_csv(cbis_csv_dir + 'mass_case_description_train_set.csv')
df_mass_test = pd.read_csv(cbis_csv_dir + 'mass_case_description_test_set.csv')

# Per-file DICOM header dump; later joined on its 'SeriesInstanceUID' column.
dicom_info = cbis_csv_dir + 'dicom_info.csv'
df_dicom_info = pd.read_csv(dicom_info)
print(len(df_dicom_info))

10237


In [46]:
# Stack the four case-description tables into one frame with a fresh 0..n-1 index.
df_original_meta = pd.concat(
    [df_calc_train, df_calc_test, df_mass_train, df_mass_test],
    ignore_index=True,
)

In [47]:
# Derive the join key from each image path: rpl() strips the first two path
# components and the file name.
subset_id = [rpl(file_path) for file_path in df_original_meta['image file path']]
df_original_meta['match_ID'] = subset_id
df_original_meta.head()

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path,breast_density,mass shape,mass margins,match_ID
0,P_00005,3.0,RIGHT,CC,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,Calc-Training_P_00005_RIGHT_CC/1.3.6.1.4.1.959...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...,,,,1.3.6.1.4.1.9590.100.1.2.474143160103683865197...
1,P_00005,3.0,RIGHT,MLO,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,Calc-Training_P_00005_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....,,,,1.3.6.1.4.1.9590.100.1.2.250596608311207922527...
2,P_00007,4.0,LEFT,CC,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,Calc-Training_P_00007_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...,,,,1.3.6.1.4.1.9590.100.1.2.228699627313487111012...
3,P_00007,4.0,LEFT,MLO,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,Calc-Training_P_00007_LEFT_MLO/1.3.6.1.4.1.959...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...,,,,1.3.6.1.4.1.9590.100.1.2.104743410411133110629...
4,P_00008,1.0,LEFT,CC,1,calcification,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3,Calc-Training_P_00008_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...,,,,1.3.6.1.4.1.9590.100.1.2.406725628213826290127...


In [48]:
# find the unique image types in the dataset
im_types = df_dicom_info['SeriesDescription'].unique()
print(im_types)

# limit to only full mammography (rows with a missing description are excluded too)
is_full_mammogram = df_dicom_info['SeriesDescription'].eq('full mammogram images')
df_dicom_info = df_dicom_info.loc[is_full_mammogram]
df_dicom_info.head()

['cropped images' 'full mammogram images' nan 'ROI mask images']


Unnamed: 0,file_path,image_path,AccessionNumber,BitsAllocated,BitsStored,BodyPartExamined,Columns,ContentDate,ContentTime,ConversionType,...,SecondaryCaptureDeviceManufacturerModelName,SeriesDescription,SeriesInstanceUID,SeriesNumber,SmallestImagePixelValue,SpecificCharacterSet,StudyDate,StudyID,StudyInstanceUID,StudyTime
1,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.24838...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.248386...,,16,16,BREAST,3526,20160426,143829.101,WSD,...,MATLAB,full mammogram images,1.3.6.1.4.1.9590.100.1.2.248386742010678582309...,1,0,ISO_IR 100,20160720.0,DDSM,1.3.6.1.4.1.9590.100.1.2.161516517311681906612...,193426.0
2,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.26721...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.267213...,,16,16,BREAST,1546,20160503,111956.298,WSD,...,MATLAB,full mammogram images,1.3.6.1.4.1.9590.100.1.2.267213171011171858918...,1,0,ISO_IR 100,20160807.0,DDSM,1.3.6.1.4.1.9590.100.1.2.291043622711253836701...,161814.0
11,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.21039...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.210396...,,16,16,BREAST,2491,20160426,130913.563,WSD,...,MATLAB,full mammogram images,1.3.6.1.4.1.9590.100.1.2.210396893911234385024...,1,0,ISO_IR 100,20160720.0,DDSM,1.3.6.1.4.1.9590.100.1.2.847966910128506698044...,190507.0
12,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.74956...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.749566...,,16,16,BREAST,4040,20160503,114631.174,WSD,...,MATLAB,full mammogram images,1.3.6.1.4.1.9590.100.1.2.749566583113839572014...,1,0,ISO_IR 100,20160807.0,DDSM,1.3.6.1.4.1.9590.100.1.2.199136909412566562327...,162929.0
15,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.98765...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.987658...,,16,16,BREAST,3080,20160503,133703.461,WSD,...,MATLAB,full mammogram images,1.3.6.1.4.1.9590.100.1.2.987658854129464108236...,1,0,ISO_IR 100,20160807.0,DDSM,1.3.6.1.4.1.9590.100.1.2.240827364811492781021...,171811.0


In [49]:
# Join the DICOM header rows with the case descriptions on the series UID.
df_kaggle_full = pd.merge(df_dicom_info, df_original_meta, left_on='SeriesInstanceUID', right_on='match_ID')

# restrict to the columns used downstream; .copy() makes the subset an independent
# frame so the assignment below does not raise SettingWithCopyWarning
kaggle_columns = ['image_path', 'Laterality', 'PatientOrientation', 'SeriesDescription', 'patient_id', 'breast density', 'image view', 'abnormality type', 'pathology', 'left or right breast']
df_kaggle_full = df_kaggle_full[kaggle_columns].copy()

# Fold 'BENIGN_WITHOUT_CALLBACK' into 'BENIGN'.
df_kaggle_full['pathology'] = df_kaggle_full['pathology'].apply(fix_benign)
df_kaggle_full.head()

Unnamed: 0,image_path,Laterality,PatientOrientation,SeriesDescription,patient_id,breast density,image view,abnormality type,pathology,left or right breast
0,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.248386...,R,CC,full mammogram images,P_01754,,CC,mass,MALIGNANT,RIGHT
1,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.267213...,R,CC,full mammogram images,P_00232,4.0,CC,calcification,MALIGNANT,RIGHT
2,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.210396...,R,MLO,full mammogram images,P_01206,,MLO,mass,MALIGNANT,RIGHT
3,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.749566...,R,MLO,full mammogram images,P_00506,2.0,MLO,calcification,BENIGN,RIGHT
4,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.987658...,R,CC,full mammogram images,P_01823,3.0,CC,calcification,BENIGN,RIGHT


In [68]:
from PIL import Image

# Copy / resize every CBIS-DDSM JPEG listed in df_kaggle_full.
save_full = False      # also write the unresized image to save_location
save_resized = True    # write the 224x224 padded image to merged_save_path

### load the images and save them somewhere else
root_path = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/'
save_location = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/CBIS-DDSM/extracted_dataset/unprocessed/'
merged_save_path = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/final_dataset/all/'

new_location_container = []   # full-size target path of every processed image
names_container = []          # file name of every processed image
failed_ids = []               # index labels of the rows that could not be processed

for i in range(len(df_kaggle_full)):
    name = 'cbis_ddsm_' + str(i) + '.jpg'
    save_loc = save_location + name
    im_loc = root_path + df_kaggle_full.iloc[i]['image_path']
    try:
        with Image.open(im_loc) as im:
            if save_full:
                im.save(save_loc, quality=100)
            if save_resized:
                img = resize_with_padding(im, (224, 224))
                img.save(merged_save_path + name, quality=100)
    except Exception as exc:
        # Best effort: report the row and the reason, then continue with the next
        # file. `Exception` (not a bare except) lets KeyboardInterrupt through.
        print('Failed for:', df_kaggle_full.iloc[i])
        print('Reason:', repr(exc))
        # Record THIS row's index label. The previous code appended `idx`, a
        # variable left over from an earlier cell's loop.
        failed_ids.append(df_kaggle_full.index[i])
    else:
        names_container.append(name)
        new_location_container.append(save_loc)

# Remove the failed rows so the two lists line up with the remaining rows.
df_kaggle_full = df_kaggle_full.drop(labels=failed_ids, axis=0)
df_kaggle_full['file_name'] = names_container
df_kaggle_full['extracted_location'] = new_location_container

## Merging data

In [72]:
# Preview the CBIS-DDSM table after the export cell added 'file_name' and 'extracted_location'.
df_kaggle_full.head()

Unnamed: 0,image_path,Laterality,PatientOrientation,SeriesDescription,patient_id,breast density,image view,abnormality type,pathology,left or right breast,file_name,extracted_location
0,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.248386...,R,CC,full mammogram images,P_01754,,CC,mass,MALIGNANT,RIGHT,cbis_ddsm_0.jpg,/Users/ryznerf/Documents/0_MIT/Spring_2022/0_P...
1,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.267213...,R,CC,full mammogram images,P_00232,4.0,CC,calcification,MALIGNANT,RIGHT,cbis_ddsm_1.jpg,/Users/ryznerf/Documents/0_MIT/Spring_2022/0_P...
2,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.210396...,R,MLO,full mammogram images,P_01206,,MLO,mass,MALIGNANT,RIGHT,cbis_ddsm_2.jpg,/Users/ryznerf/Documents/0_MIT/Spring_2022/0_P...
3,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.749566...,R,MLO,full mammogram images,P_00506,2.0,MLO,calcification,BENIGN,RIGHT,cbis_ddsm_3.jpg,/Users/ryznerf/Documents/0_MIT/Spring_2022/0_P...
4,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.987658...,R,CC,full mammogram images,P_01823,3.0,CC,calcification,BENIGN,RIGHT,cbis_ddsm_4.jpg,/Users/ryznerf/Documents/0_MIT/Spring_2022/0_P...


In [73]:
# Preview the CMMD per-image table, including its 'file_name' and 'extracted_location' columns.
df_data_details.head()

Unnamed: 0,ID,LR,Method,Abnormality,Outcome,Original_Location,file_name,extracted_location
0,D1-0001,R,CC,calcification,Benign,/CMMD/D1-0001/07-18-2010-NA-NA-79377/1.000000-...,breast_scan_china_0.jpeg,unprocessed/breast_scan_china_0.jpeg
1,D1-0001,R,MLO,calcification,Benign,/CMMD/D1-0001/07-18-2010-NA-NA-79377/1.000000-...,breast_scan_china_1.jpeg,unprocessed/breast_scan_china_1.jpeg
2,D1-0002,L,CC,calcification,Benign,/CMMD/D1-0002/07-18-2010-NA-NA-49231/1.000000-...,breast_scan_china_2.jpeg,unprocessed/breast_scan_china_2.jpeg
3,D1-0002,L,MLO,calcification,Benign,/CMMD/D1-0002/07-18-2010-NA-NA-49231/1.000000-...,breast_scan_china_3.jpeg,unprocessed/breast_scan_china_3.jpeg
4,D1-0003,L,CC,calcification,Benign,/CMMD/D1-0003/07-18-2011-NA-NA-25491/1.000000-...,breast_scan_china_4.jpeg,unprocessed/breast_scan_china_4.jpeg


In [83]:
# Reduce the CBIS-DDSM table to the shared schema and tag its origin.
# rename/assign return new frames, so no value is assigned into a column subset
# (which raises SettingWithCopyWarning).
df_kaggle_full = (
    df_kaggle_full[['Laterality', 'PatientOrientation', 'abnormality type', 'pathology', 'file_name']]
    .rename(columns={
        'abnormality type': 'Abnormality',
        'pathology': 'Pathology',
        'file_name': 'File_name',
    })
    .assign(source='CBIS-DDSM')
)
df_kaggle_full.head()

Unnamed: 0,Laterality,PatientOrientation,Abnormality,Pathology,File_name,source
0,R,CC,mass,MALIGNANT,cbis_ddsm_0.jpg,CBIS-DDSM
1,R,CC,calcification,MALIGNANT,cbis_ddsm_1.jpg,CBIS-DDSM
2,R,MLO,mass,MALIGNANT,cbis_ddsm_2.jpg,CBIS-DDSM
3,R,MLO,calcification,BENIGN,cbis_ddsm_3.jpg,CBIS-DDSM
4,R,CC,calcification,BENIGN,cbis_ddsm_4.jpg,CBIS-DDSM


In [84]:
# Reduce the CMMD table to the same schema as the CBIS-DDSM table and tag its origin.
# rename/assign return new frames, so no value is assigned into a column subset
# (which raises SettingWithCopyWarning).
df_data_details = (
    df_data_details[['LR', 'Method', 'Abnormality', 'Outcome', 'file_name']]
    .rename(columns={
        'LR': 'Laterality',
        'Method': 'PatientOrientation',
        'Outcome': 'Pathology',
        'file_name': 'File_name',
    })
    .assign(source='China')
)
df_data_details.head()

Unnamed: 0,Laterality,PatientOrientation,Abnormality,Pathology,File_name,source
0,R,CC,calcification,Benign,breast_scan_china_0.jpeg,China
1,R,MLO,calcification,Benign,breast_scan_china_1.jpeg,China
2,L,CC,calcification,Benign,breast_scan_china_2.jpeg,China
3,L,MLO,calcification,Benign,breast_scan_china_3.jpeg,China
4,L,CC,calcification,Benign,breast_scan_china_4.jpeg,China


In [91]:
# Stack both sources and put the labels in one spelling: the CBIS-DDSM table uses
# upper case ('MALIGNANT'), the CMMD table capitalised words ('Malignant').
df_full_metadata = pd.concat([df_kaggle_full, df_data_details], ignore_index=True)
# .str.lower() is vectorised and leaves missing labels as NaN instead of raising.
df_full_metadata['Pathology'] = df_full_metadata['Pathology'].str.lower()
df_full_metadata.to_csv('/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/final_dataset/all_metadata.csv')

In [93]:
# Preview the combined metadata table (its 'Pathology' values were lower-cased before saving).
df_full_metadata.head()

Unnamed: 0,Laterality,PatientOrientation,Abnormality,Pathology,File_name,source
0,R,CC,mass,malignant,cbis_ddsm_0.jpg,CBIS-DDSM
1,R,CC,calcification,malignant,cbis_ddsm_1.jpg,CBIS-DDSM
2,R,MLO,mass,malignant,cbis_ddsm_2.jpg,CBIS-DDSM
3,R,MLO,calcification,benign,cbis_ddsm_3.jpg,CBIS-DDSM
4,R,CC,calcification,benign,cbis_ddsm_4.jpg,CBIS-DDSM


## Saving to different folders

In [101]:
import shutil, os

# Copy every resized image into a sub-folder named after its pathology label.
failed_data = []   # file names that could not be copied
types = ['malignant', 'benign']
root_path = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/final_dataset/all/'
save_location = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/final_dataset/training_data'

for pat_type in types:
    class_dir = save_location + '/' + pat_type
    # Without an existing directory, shutil.copy() would write every image to a
    # single FILE called `pat_type`, each copy overwriting the previous one.
    os.makedirs(class_dir, exist_ok=True)
    file_names = df_full_metadata[df_full_metadata['Pathology'] == pat_type]['File_name'].to_numpy()
    for f in file_names:
        try:
            shutil.copy(root_path + f, class_dir)
        except Exception as exc:
            # Best effort: remember the file and continue. `Exception` (not a
            # bare except) lets KeyboardInterrupt through.
            failed_data.append(f)
            print('Failed for:', f)
            print('Reason:', repr(exc))

Failed for: breast_scan_china_3742.jpeg
Failed for: breast_scan_china_3743.jpeg


In [110]:
# Remove every file that could not be copied from the metadata, then re-save it.
# The previous loop iterated over `failed` but filtered on `f`, a variable left
# over from the copy cell, so it removed the wrong row.
df_full_metadata = df_full_metadata[~df_full_metadata['File_name'].isin(failed_data)]
df_full_metadata.to_csv('/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/final_dataset/all_metadata.csv')

In [15]:
!pip install split-folders

Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1


In [16]:
# Reload the saved metadata. The file was written with its index, so the index
# comes back as extra 'Unnamed: ...' columns.
all_metadata_csv = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/final_dataset/all_metadata.csv'
df_full_metadata = pd.read_csv(all_metadata_csv)
df_full_metadata.head()

Unnamed: 0.1,Unnamed: 0,Laterality,PatientOrientation,Abnormality,Pathology,File_name,source
0,0,R,CC,mass,malignant,cbis_ddsm_0.jpg,CBIS-DDSM
1,1,R,CC,calcification,malignant,cbis_ddsm_1.jpg,CBIS-DDSM
2,2,R,MLO,mass,malignant,cbis_ddsm_2.jpg,CBIS-DDSM
3,3,R,MLO,calcification,benign,cbis_ddsm_3.jpg,CBIS-DDSM
4,4,R,CC,calcification,benign,cbis_ddsm_4.jpg,CBIS-DDSM


In [18]:
import splitfolders

In [26]:
# Source: one sub-folder per class. Destination: the split copy produced by split-folders.
input_folder = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/final_dataset/training_data'
output_folder = '/Users/ryznerf/Documents/0_MIT/Spring_2022/0_Projects/0_Computer_Vision_project/6.869_Computer_Vision_project/Data/final_dataset/pytorch_data_input'

# seed / group_prefix / move are the library defaults; only the ratio is customised
# (70 % / 15 % / 15 % instead of the library's (.8, .1, .1)).
# NOTE(review): the split is per file, so images of the same patient may land in
# different subsets - confirm this is acceptable for evaluation.
split_ratio = (.7, .15, .15)
splitfolders.ratio(input_folder, output_folder, seed=1337, ratio=split_ratio, group_prefix=None, move=False)


Copying files: 7026 files [00:02, 2945.51 files/s]


## Depicting situation 