## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

In [2]:
# Colab-only step: mount Google Drive at /content/drive so the data files
# referenced by later cells (all paths start with /content/drive/) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Data

In [4]:
# Directory on the mounted Drive that holds the preprocessed feature arrays.
processed_data_path = '/content/drive/My Drive/Breast_Cancer_Detection/Processed_Data/'

# Read the development and test feature arrays from disk.
dev_file = processed_data_path + 'X_dev.npy'
test_file = processed_data_path + 'X_test.npy'
X_dev = np.load(dev_file)
X_test = np.load(test_file)

# Flatten each array to 2-D with 2048 columns (one column per MoCo dimension),
# letting NumPy infer the number of rows.
X_dev_reshaped = np.reshape(X_dev, (-1, 2048))
print(X_dev_reshaped.shape)

X_test_reshaped = np.reshape(X_test, (-1, 2048))

(344000, 2048)


## Remove Low-Variance Features

In [None]:
# Number of feature columns handed to the selector at a time. Only the columns
# are batched; every batch still contains all rows.
batch_size = 500

# Variance cut-off passed to VarianceThreshold.
variance_threshold = 0.01

# Ceiling division so a trailing partial batch is included. Floor division
# (2048 // 500 == 4) stops at column 2000 and silently discards the remaining
# 48 features without ever checking their variance.
num_features = X_dev_reshaped.shape[1]
num_batches = (num_features + batch_size - 1) // batch_size

# Per-batch results are collected in lists and stacked once after the loop.
X_dev_high_variance = []
X_test_high_variance = []

# Process features in batches
for i in range(num_batches):
    start_idx = i * batch_size
    # Clamp so the final (possibly shorter) batch ends at the last column.
    end_idx = min(start_idx + batch_size, num_features)

    # Extract the same column range from both splits
    dev_features_batch = X_dev_reshaped[:, start_idx:end_idx]
    test_features_batch = X_test_reshaped[:, start_idx:end_idx]

    # Fit the selector on the development data only, then apply the identical
    # column selection to the test data so both keep the same features.
    # NOTE: VarianceThreshold.fit raises ValueError when no column in the batch
    # exceeds the threshold, so a batch made up entirely of low-variance
    # features stops the cell instead of being skipped.
    variance_selector = VarianceThreshold(threshold=variance_threshold)
    dev_high_variance_batch = variance_selector.fit_transform(dev_features_batch)
    test_high_variance_batch = variance_selector.transform(test_features_batch)

    # Keep the surviving columns of this batch
    X_dev_high_variance.append(dev_high_variance_batch)
    X_test_high_variance.append(test_high_variance_batch)

# Stack the per-batch results column-wise, preserving the original feature order
X_dev_high_variance = np.hstack(X_dev_high_variance)
X_test_high_variance = np.hstack(X_test_high_variance)

## Reshape to 3D

In [6]:
# All Dimensions

# Regroup the rows of each 2-D matrix into blocks of 1000, keeping every
# feature column: the result has shape (n_blocks, 1000, n_features).
dev_feature_count = X_dev_reshaped.shape[1]
X_dev_3D = np.reshape(X_dev_reshaped, (-1, 1000, dev_feature_count))

test_feature_count = X_test_reshaped.shape[1]
X_test_3D = np.reshape(X_test_reshaped, (-1, 1000, test_feature_count))

In [None]:
# Low Dimensions

# Regroup the rows of the variance-filtered matrices into blocks of 1000,
# giving arrays of shape (n_blocks, 1000, n_kept_features).
#
# The feature count is read from the last axis (shape[-1]) rather than
# shape[1]. This cell overwrites its own inputs, so on a second run the arrays
# are already 3-D and shape[1] would be 1000 instead of the feature count;
# shape[-1] is the feature count in both the 2-D and 3-D layouts, which makes
# re-running the cell a no-op instead of an error or a silently wrong shape.
X_dev_high_variance = X_dev_high_variance.reshape((-1, 1000, X_dev_high_variance.shape[-1]))
X_test_high_variance = X_test_high_variance.reshape((-1, 1000, X_test_high_variance.shape[-1]))

## Save Results

In [7]:
# Output directory on the mounted Drive.
processed_data_path = '/content/drive/My Drive/Breast_Cancer_Detection/Processed_Data/'

# Write the full-width 3-D arrays (all feature columns) for each split.
dev_all_dim_file = processed_data_path + 'X_dev_all_dim.npy'
np.save(dev_all_dim_file, X_dev_3D)

test_all_dim_file = processed_data_path + 'X_test_all_dim.npy'
np.save(test_all_dim_file, X_test_3D)

In [None]:
# Write the variance-filtered 3-D arrays for each split next to the full-width ones.
dev_low_dim_file = processed_data_path + 'X_dev_low_dim.npy'
np.save(dev_low_dim_file, X_dev_high_variance)

test_low_dim_file = processed_data_path + 'X_test_low_dim.npy'
np.save(test_low_dim_file, X_test_high_variance)