# Data Preparation

## Imports

In [None]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [None]:
# Colab-only step: mount Google Drive so the dataset paths under /content/drive resolve.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


## Load Data

In [None]:
# Read the train / test metadata tables from the mounted Drive folder
data_path = '/content/drive/My Drive/Breast_Cancer_Detection/Data/'

# Resolve the two CSV locations first, then read them
train_metadata_csv = data_path + "train_metadata.csv"
test_metadata_csv = data_path + "test_metadata.csv"
df_metadata_train = pd.read_csv(train_metadata_csv)
df_metadata_test = pd.read_csv(test_metadata_csv)

# Report (rows, columns) of each table as a quick sanity check
train_shape_text = str(df_metadata_train.shape)
test_shape_text = str(df_metadata_test.shape)
print('Training Metadata Shape: ' + train_shape_text)
print('Testing Metadata Shape: ' + test_shape_text + '\n')

Training Metadata Shape: (344, 3)
Testing Metadata Shape: (149, 3)



In [None]:
# Load Training Targets
df_output_train = pd.read_csv(data_path + "train_output.csv")

# Merge Metadata and Targets for training data.
# The join is an inner join on "Sample ID" (pandas' default, made explicit here), so a
# metadata row without a matching target row is dropped. validate="one_to_one" makes
# pandas raise a MergeError if "Sample ID" is duplicated on either side, instead of
# silently multiplying rows.
df_metadata_y_train = df_metadata_train.merge(
    df_output_train, on="Sample ID", how="inner", validate="one_to_one"
)

# Surface any metadata rows that the inner join dropped because they have no target.
num_unlabelled = len(df_metadata_train) - len(df_metadata_y_train)
if num_unlabelled > 0:
    print(f"Warning: {num_unlabelled} training samples have no target and were dropped by the merge.")

In [None]:
# Load Training Data
#
# For every training sample, load its MoCo feature matrix from disk and split it into
# zoom (column 0), tile coordinates (columns 1-2) and features (columns 3 onwards).
# Only the first sample encountered for each patient is kept.
EXPECTED_NUM_TILES = 1000  # every sample must contribute exactly this many tiles (rows)

X_train = []
y_train = []
centers_train = []
patients_train = []
zoom_train = []
coordinates_train = []

# Patients already loaded; a set gives the same de-duplication as scanning
# patients_train, without a linear scan on every iteration.
seen_patients = set()

# Shape of one sample's feature matrix, needed later to undo the flattening of X_train.
# Stays None if no sample was loaded.
feat_shape = None

for sample, label, center, patient in tqdm(df_metadata_y_train[["Sample ID", "Target", "Center ID", "Patient ID"]].values):

    if patient in seen_patients:   # prevents duplicate patients in training set
        continue
    seen_patients.add(patient)

    # Load the coordinates and features (1000, 3+2048)
    all_features = np.load(data_path + 'train/moco_features/' + sample)

    # Check each sample has 1000 tiles. Fail here, naming the sample, rather than
    # building a ragged list that cannot be stacked into a rectangular array below.
    num_tiles = len(all_features)
    if num_tiles != EXPECTED_NUM_TILES:
        raise ValueError(
            f"Sample {sample} has {num_tiles} tiles, expected {EXPECTED_NUM_TILES}."
        )

    # Remove the coordinates and zoom from MoCo features
    features = all_features[:, 3:]
    # Store zoom for each sample
    zoom = all_features[:, 0]
    # Store coordinates for each sample
    coordinates = all_features[:, 1:3]

    # Save shape of features for reshaping X_train
    feat_shape = features.shape

    # Store all the data for each sample in 1 row (this will be reshaped back later)
    X_train.append(features.flatten())

    # Store data label and other metadata
    y_train.append(label)
    centers_train.append(center)
    patients_train.append(patient)
    zoom_train.append(zoom.flatten())
    coordinates_train.append(coordinates.flatten())

# Convert to numpy arrays
# NOTE(review): X_train keeps the dtype returned by np.load, whereas the test-loading
# cell casts X_test to float16 - confirm the dtype mismatch is intended.
X_train = np.array(X_train)
y_train = np.array(y_train)
centers_train = np.array(centers_train)
patients_train = np.array(patients_train)
zoom_train = np.array(zoom_train)
coordinates_train = np.array(coordinates_train)


100%|██████████| 344/344 [02:20<00:00,  2.44it/s]


In [None]:
# Summarise the shape of every training array, one line per array
train_shape_report = [
    f' X_train shape: {X_train.shape}',
    f' y_train shape: {y_train.shape}',
    f' centers_train shape: {centers_train.shape}',
    f' patients_train shape: {patients_train.shape}',
    f' zoom_train shape: {zoom_train.shape}',
    f' coordinates_train shape: {coordinates_train.shape}',
]
print('\n'.join(train_shape_report))

 X_train shape: (305, 2048000)
 y_train shape: (305,)
 centers_train shape: (305,)
 patients_train shape: (305,)
 zoom_train shape: (305, 1000)
 coordinates_train shape: (305, 2000)


In [None]:
# Load Testing Data
#
# Same per-sample layout as the training data: column 0 is stored as zoom, columns 1-2
# as coordinates and the remaining columns as features. There is no target column here
# and samples are not de-duplicated by patient.
EXPECTED_NUM_TILES = 1000  # same tile count the training loader checks for

X_test = []
centers_test = []
patients_test = []
zoom_test = []
coordinates_test = []

# Process test data (remove coordinates and flatten each sample into a row)
for sample, center, patient in tqdm(df_metadata_test[["Sample ID", "Center ID", "Patient ID"]].values):

    all_features = np.load(data_path + 'test/moco_features/' + sample)

    # Validate the tile count up front, naming the offending sample; rows of unequal
    # length cannot be stacked into the rectangular arrays built after the loop.
    num_tiles = len(all_features)
    if num_tiles != EXPECTED_NUM_TILES:
        raise ValueError(
            f"Sample {sample} has {num_tiles} tiles, expected {EXPECTED_NUM_TILES}."
        )

    features = all_features[:, 3:]
    zoom = all_features[:, 0]
    coordinates = all_features[:, 1:3]
    X_test.append(features.flatten())

    centers_test.append(center)
    patients_test.append(patient)
    zoom_test.append(zoom.flatten())
    coordinates_test.append(coordinates.flatten())


# Convert to numpy arrays
# NOTE(review): only X_test is cast to float16; the training cell stores X_train with
# the dtype np.load returned - confirm the dtype mismatch is intended.
X_test = np.array(X_test).astype(np.float16)
centers_test = np.array(centers_test)
patients_test = np.array(patients_test)
zoom_test = np.array(zoom_test)
coordinates_test = np.array(coordinates_test)

100%|██████████| 149/149 [01:04<00:00,  2.33it/s]


## Save Data

In [None]:
# Write every prepared array to the processed-data folder on Drive as a .npy file
processed_data_path = '/content/drive/My Drive/Breast_Cancer_Detection/Processed_Data/'

# (file name, array) pairs, listed in the order they are written.
# NOTE(review): 'zoom_train.npy' is the only training file that does not use the
# '_dev' suffix - confirm against whatever loads these files before renaming it.
arrays_to_save = [
    ('X_dev.npy', X_train),
    ('X_test.npy', X_test),
    ('y_dev.npy', y_train),
    ('centers_dev.npy', centers_train),
    ('patients_dev.npy', patients_train),
    ('zoom_train.npy', zoom_train),
    ('coordinates_dev.npy', coordinates_train),
    ('centers_test.npy', centers_test),
    ('patients_test.npy', patients_test),
    ('zoom_test.npy', zoom_test),
    ('coordinates_test.npy', coordinates_test),
]

for file_name, array in arrays_to_save:
    np.save(processed_data_path + file_name, array)
