# Preparing the data sets for both models
### Sorting all the images into the required folders so that we can easily make datasets for training using the pytorch ImageFolder function

In [27]:
import torch
import torchvision
import torchvision.transforms as transforms
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil
from PIL import Image

In [6]:
excel_path = '/Users/aveek/Downloads/MINI-DDSM-Complete-JPEG-8/DataWMask.xlsx'
df = pd.read_excel(excel_path)

In [7]:
df.head()

Unnamed: 0,fullPath,fileName,View,Side,Status,Tumour_Contour,Tumour_Contour2,Age,Density
0,Benign\0029\C_0029_1.LEFT_CC.jpg,C_0029_1.LEFT_CC.jpg,CC,LEFT,Benign,Benign\0029\C_0029_1.LEFT_CC_Mask.jpg,-,66.0,3
1,Benign\0029\C_0029_1.LEFT_MLO.jpg,C_0029_1.LEFT_MLO.jpg,MLO,LEFT,Benign,Benign\0029\C_0029_1.LEFT_MLO_Mask.jpg,-,66.0,3
2,Benign\0029\C_0029_1.RIGHT_CC.jpg,C_0029_1.RIGHT_CC.jpg,CC,RIGHT,Benign,-,-,66.0,3
3,Benign\0029\C_0029_1.RIGHT_MLO.jpg,C_0029_1.RIGHT_MLO.jpg,MLO,RIGHT,Benign,-,-,66.0,3
4,Benign\0033\C_0033_1.LEFT_CC.jpg,C_0033_1.LEFT_CC.jpg,CC,LEFT,Benign,-,-,60.0,3


## Process

#### Part 1: detection of tumours
- Create folders for tumour (1) and no tumour (0)<br>
- Gonna cycle through every filepath in the excel and check for whether tumour_contour path is not empty, so tumour is present <br>
- if tumour present, image copied to '1' folder, otherwise copied to 0 folder
  - this means that we are doing a breast by breast approach, rather than by the patients in the folder. basically, as long as one breast has a tumour detected, the patient is classified as cancerous and put in the benign/cancer folder, but that doesnt mean that there are tumours in both breasts. we hence need to classify each breast separately as cancerous or not, rather than the image
  - will have to apply this system when doing our metrics as well

<br>

#### Part 2: classification of breast density
Create folders for the different density classifications 1,2,3,4 corresponding to A,B,C,D
- may be a bit more challenging as both breasts are given the same density classification
- cycle through each breast, and put them in corresponding folders 1,2,3,4

_______________


In [99]:
# helper functions

# don't really need this helper function anymore as we're not flipping images
def save_images(path, new_path):
    """saves / copies images into new folders. flips left view images to right view to standardise view

    Args:
        path (string): original path to read images
        new_path (string): new training data folders for model training
    """
    
    if 'LEFT' in path:
        # print('here')
        
        im = Image.open(path)
        im = im.transpose(Image.FLIP_LEFT_RIGHT)
        
        name = path.split('/')[-1]
        # print(name)
        im = im.save(new_path + name)
        return 0
        
    else:
        shutil.copy(path, new_path)
        return 1
        
    # return count_L, count_R

______
## Part 1 : Tumour detection

In [102]:
# was trying to do it some way which included flipping the images into the same view, but decided against it
ddsm_path = '/Users/aveek/Downloads/MINI-DDSM-Complete-JPEG-8/'

tumour_path = '/Users/aveek/development/breast-cancer-masking/Breast-Cancer-Masking/training-data/tumour-detection/1/'
no_tumour_path = '/Users/aveek/development/breast-cancer-masking/Breast-Cancer-Masking/training-data/tumour-detection/0/'


for idx in df.index[:]:
    # print(idx)
    # print(df.iloc[idx].Tumour_Contour)
    contour_path = df.iloc[idx].Tumour_Contour
    img_path = df.iloc[idx].fullPath
    img_path = img_path.replace('\\', '/')
    
    # print((ddsm_path+img_path).split('.')[1])
    
    if contour_path == '-':
        # this means no tumour in this image
        shutil.copy(ddsm_path+img_path, no_tumour_path)
        
    else:
        # tumour present in this image
        shutil.copy(ddsm_path+img_path, tumour_path)
        

# print(count_L,count_R)


In [None]:
df.Status.value_counts()

Status
Cancer    2716
Benign    2684
Normal    2408
Name: count, dtype: int64

_____
## Part 2: Density classification

In [69]:
df.Density.value_counts()

Density
2    3020
3    2320
4    1388
1    1076
0       4
Name: count, dtype: int64

#### not sute why there are 4 counts of density class 0, there should only be 4 classes and i assume ABCD is 1234

In [68]:
df.loc[df.Density == 0]

Unnamed: 0,fullPath,fileName,View,Side,Status,Tumour_Contour,Tumour_Contour2,Age,Density
4236,Cancer\1825\A_1825_1.LEFT_CC.jpg,A_1825_1.LEFT_CC.jpg,CC,LEFT,Cancer,-,-,52.0,0
4237,Cancer\1825\A_1825_1.LEFT_MLO.jpg,A_1825_1.LEFT_MLO.jpg,MLO,LEFT,Cancer,-,-,52.0,0
4238,Cancer\1825\A_1825_1.RIGHT_CC.jpg,A_1825_1.RIGHT_CC.jpg,CC,RIGHT,Cancer,Cancer\1825\A_1825_1.RIGHT_CC_Mask.jpg,-,52.0,0
4239,Cancer\1825\A_1825_1.RIGHT_MLO.jpg,A_1825_1.RIGHT_MLO.jpg,MLO,RIGHT,Cancer,Cancer\1825\A_1825_1.RIGHT_MLO_Mask.jpg,-,52.0,0


#### seems like it's only this one patient that is classified as 0. some kinda mistake so can ignore i guess

### sorting the images into 0,1,2,3 folders

In [103]:
ddsm_path = '/Users/aveek/Downloads/MINI-DDSM-Complete-JPEG-8/'


save_path = '/Users/aveek/development/breast-cancer-masking/Breast-Cancer-Masking/training-data/density-classification/'
labels = ['0','1','2','3']

for idx in df.index:
    # print(idx)
    # print(df.iloc[idx].Tumour_Contour)
    d_class = df.iloc[idx].Density
    img_path = df.iloc[idx].fullPath
    img_path = img_path.replace('\\', '/')
    
    # print((ddsm_path+img_path).split('.')[1])
    
    shutil.copy(ddsm_path+img_path, save_path+labels[d_class-1]+'/')
    # note the difference between the classification labels on the excel file (1-4) and the labels i am using 
    # for the folders (0-3) which is used in machine learning model ouputs (like how we index in python)



FileNotFoundError: [Errno 2] No such file or directory: '/Users/aveek/development/breast-cancer-masking/Breast-Cancer-Masking/training-data/density-classification-2/2/'

In [76]:
save_images('/Users/aveek/Downloads/MINI-DDSM-Complete-JPEG-8/Benign/0236/C_0236_1.LEFT_MLO.jpg','/Users/aveek/development/breast-cancer-masking/Breast-Cancer-Masking/training-data/tumour-detection/test/')

In [101]:
# was trying to do it some way which included flipping the images into the same view, but decided against it
ddsm_path = '/Users/aveek/Downloads/MINI-DDSM-Complete-JPEG-8/'

tumour_path = '/Users/aveek/development/breast-cancer-masking/Breast-Cancer-Masking/training-data/tumour-detection/1/'
no_tumour_path = '/Users/aveek/development/breast-cancer-masking/Breast-Cancer-Masking/training-data/tumour-detection/0/'

test_path = '/Users/aveek/development/breast-cancer-masking/Breast-Cancer-Masking/training-data/tumour-detection/test/'

count_L = 0
count_R = 0

for idx in df.index[:]:
    # print(idx)
    # print(df.iloc[idx].Tumour_Contour)
    contour_path = df.iloc[idx].Tumour_Contour
    img_path = df.iloc[idx].fullPath
    img_path = img_path.replace('\\', '/')
    
    # print((ddsm_path+img_path).split('.')[1])
    
    
    if contour_path == '-':
        # this means no tumour in this image
        save_images(ddsm_path+img_path, no_tumour_path)
        
    else:
        # tumour present in this image
        save_images(ddsm_path+img_path, tumour_path)
        

# print(count_L,count_R)


3904 3904


## seems like it's more complicated than i thought..... 
there was an issue with the laterality of some images so i went to debugging and looking at the 
differences in the ics files and found out that there are 2 different 'Digitizers' used. the images that use Lumisys
have the correct laterality while the ones that use HOWTEK 43.5 have the opposite laterality.....
so seems like we gotta check the digitizer as well first before sorting the laterality. hais

### oh no turns out its still wrong as some files digitzed with howtek 43.5 are in the correct laterality
#### think the only way forward is to do a check of pixel values on left and rigth to determine the laterality