# Task 2 Part 1 Pipeline

This notebook can be used to combine the images from separate datasets (currently Mendeley Hussain, CRIC, and CDetector), standardize their names and labels, and split them into train, test, and val directories. No resizing, transformation, or augmentation is done here.

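This notebook requires access to the pipeline module code in `modules/data` and to the datasets in `data` (both located under the project folder set as `MY_PATH` in the config cell).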


Author(s): Gaylyn Ruvere, using a modified pipeline module originally written by Leon Hamnett and Paolo Dano


Date: 2022_09_18

# Mount Google Drive

In [1]:
# Optional cell: skip it when the notebook is not running on Colab with
# Google Drive as the storage backend.

from google.colab import drive

# force_remount=True requests a fresh mount even if the drive is already
# mounted in this runtime.
drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


# Customize Config

In [2]:
# Customize and write the config.py file
# You may need to restart runtime if you run this cell more than once with changes

import os
from pathlib import Path


# Data Paths
MY_PATH = '/content/drive/My Drive/DrCadx'
DATA_FOLDER = MY_PATH + '/data'

# Dataset Paths
CRIC_DATASET_PATH = os.path.join(DATA_FOLDER, 'structured', 'CRIC_Dataset_Structured')
MEND_DATASET_PATH = os.path.join(DATA_FOLDER, 'raw', 'Image_datasets', 'partner_provided_datasets', 'mendeley_hussain_liquid_based_cytology')
CDET_DATASET_PATH = os.path.join(DATA_FOLDER, 'structured', 'CDetector_structured')

# These two paths are written into config.py below, so they must be defined
# here; leaving them commented out raises NameError on a fresh kernel after
# config.py has already been truncated.
# NOTE(review): the notebook intro lists only Mendeley Hussain, CRIC and
# CDetector as currently combined -- confirm how the pipeline module treats
# these two entries.
ASCY_DATASET_PATH = os.path.join(DATA_FOLDER, 'structured', 'ASCYscraped_structured')
IARC_DATASET_PATH = os.path.join(DATA_FOLDER, 'structured', 'IARCscraped_structured')

print(CRIC_DATASET_PATH)

# NC Labels
CRIC_NC_LABEL = 'NILM'
MEND_NC_LABEL = 'NL'
CDET_NC_LABEL = 'NILM'
ASCY_NC_LABEL = 'NILM'
IARC_NC_LABEL = 'NILM' #There actually aren't any in this set

# Processed Path
CERVAI_PATH = os.path.join(DATA_FOLDER, 'CervAI')
#DEST_FOLDER = '/content/gdrive/.shortcut-targets-by-id/1MN_MgyhaHPoUpQYCws4tOpGbXA-NfwN_/Official_Folder_for_CervAi'
#DEST_DATA_FOLDER = os.path.join(DEST_FOLDER, 'Data')
#CERVAI_PATH = os.path.join(DEST_DATA_FOLDER, 'CervAI_minside300_fixed')
METADATA_PATH = os.path.join(CERVAI_PATH, 'MetaData.md')
CLASS_LABELS = ['NILM', 'SCC', 'LSIL', 'ASC-US', 'ASC-H', 'HSIL']
# NOTE(review): 'AGC' appears in both CLASS_LABELS_NILM and CLASS_LABELS_OMIT;
# which list takes precedence is decided by the pipeline module -- confirm
# that the overlap is intentional.
CLASS_LABELS_NILM = ['NL', 'NILM', 'ACTIN', 'AGC', 'TRICH', 'CAND', 'FLORA', 'HERPS']
CLASS_LABELS_OMIT = ['AGC','AGC-FN','AIS','ADC'] # Glandular cells, we don't have enough data to work with them now

# Train Test Resources
TRAIN_PATH = os.path.join(CERVAI_PATH, 'train')
TEST_PATH = os.path.join(CERVAI_PATH, 'test')
VAL_PATH = os.path.join(CERVAI_PATH, 'val')
SPLIT = [0.8, 0.1, 0.1]


# (Over)write the config.py file
# Every setting is collected before the file is opened, so a missing name
# fails before config.py is truncated. The order matches the assignment order
# in the generated file.
config_entries = [
    ('DATA_FOLDER', DATA_FOLDER),
    ('CRIC_DATASET_PATH', CRIC_DATASET_PATH),
    ('MEND_DATASET_PATH', MEND_DATASET_PATH),
    ('CDET_DATASET_PATH', CDET_DATASET_PATH),
    ('ASCY_DATASET_PATH', ASCY_DATASET_PATH),
    ('IARC_DATASET_PATH', IARC_DATASET_PATH),
    ('CRIC_NC_LABEL', CRIC_NC_LABEL),
    ('MEND_NC_LABEL', MEND_NC_LABEL),
    ('CDET_NC_LABEL', CDET_NC_LABEL),
    ('ASCY_NC_LABEL', ASCY_NC_LABEL),
    ('IARC_NC_LABEL', IARC_NC_LABEL),
    ('CERVAI_PATH', CERVAI_PATH),
    ('METADATA_PATH', METADATA_PATH),
    ('CLASS_LABELS', CLASS_LABELS),
    ('CLASS_LABELS_NILM', CLASS_LABELS_NILM),
    ('CLASS_LABELS_OMIT', CLASS_LABELS_OMIT),
    ('TRAIN_PATH', TRAIN_PATH),
    ('TEST_PATH', TEST_PATH),
    ('VAL_PATH', VAL_PATH),
    ('SPLIT', SPLIT),
]

# The with-statement closes the file on exit, so no explicit close() is needed.
with open(MY_PATH + '/modules/data/resources/config.py', 'w') as f:
    # !r writes each value as a Python literal: the same text as hand-written
    # quoting for the values above, but still valid Python if a path ever
    # contains a quote or a backslash.
    f.writelines(f"{name} = {value!r}\n" for name, value in config_entries)

/content/drive/My Drive/DrCadx/data/structured/CRIC_Dataset_Structured


# Combine and generate train/val/test sets

In [3]:
import sys

# Expose the project's data-pipeline package directory to the import system.
sys.path.append(f"{MY_PATH}/modules/data")

# The pipeline module lives in the directory added above, so it can only be
# imported after the path has been extended.
import ETL_mix_datasets

# Run the dataset-combination pipeline; the module prints its own progress
# messages.
ETL_mix_datasets.run_pipeline()

All datasets present
CervAI folder already present, confirm delete existing folder? (y/n) - y
Existing Folder deleted
Made new cervAi folder...
Extracted CRIC paths and labels
Extracted MEND paths and labels
Extracted CDET paths and labels

CervAI Folder Created!
