# Logistic Regression

A logistic regression model will be used as the baseline.

## Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Data

In [5]:
processed_data_path = '/content/drive/My Drive/Breast_Cancer_Detection/Processed_Data/'

X_dev = np.load(processed_data_path + 'X_dev.npy')
y_dev = np.load(processed_data_path + 'y_dev.npy')

X_test = np.load(processed_data_path + 'X_test.npy')

## Sum features

Each tile has 2048 features. To reduce dimensionality, we will sum all the features together for each tile.

In [6]:
X_dev_reshaped = X_dev.reshape(len(X_dev), 1000, 2048)
X_dev_summed = X_dev_reshaped.sum(axis=2)

X_test_reshaped = X_test.reshape(len(X_test), 1000, 2048)
X_test_summed = X_test_reshaped.sum(axis=2)

print(f'New X_dev shape: {X_dev_summed.shape}')
print(f'New X_test shape: {X_test_summed.shape}')

New X_dev shape: (305, 1000)
New X_test shape: (149, 1000)


## Tile Pooling, Sorting and Averaging

4 different logistic regression models will be trained:


*   All tiles in each sample will be sorted and used as inputs (Sorted)
*   The tiles will be averaged per sample and used as inputs (Mean Pooling)
*   The max tile per sample will be used as inputs (Max Pooling)
*   The top 5 max tiles per sample will be used as inputs (Top 5 Max Pooling)



In [7]:
# Sort features in descending order for each row
X_dev_sorted = np.sort(X_dev_summed, axis=1)[:, ::-1]
X_test_sorted = np.sort(X_test_summed, axis=1)[:, ::-1]

# Mean Pool each row
X_dev_mean = np.mean(X_dev_summed, axis=1)
X_test_mean = np.mean(X_test_summed, axis=1)

# Max Pool each row
X_dev_max = X_dev_sorted[:, 0]
X_test_max = X_test_sorted[:, 0]

# Top 5 Max Pool each row
X_dev_max_5 = X_dev_sorted[:, 0:2]
X_test_max_5 = X_test_sorted[:, 0:2]

## Standarize Data

In [8]:
def X_standardize(X_dev, X_test):

    X_dev_flat = X_dev.flatten()

    feature_mean = np.mean(X_dev_flat)
    feature_std = np.std(X_dev_flat)

    X_dev_scaled = (X_dev - feature_mean) / feature_std
    X_test_scaled = (X_test - feature_mean) / feature_std

    return X_dev_scaled, X_test_scaled

## Logistic Regression

In [9]:
def Logisitc_Regression(X_dev, y_dev, X_test):

    # Scale the data
    X_dev_scaled, X_test_scaled = X_standardize(X_dev, X_test)

    if X_dev_scaled.ndim == 1:
        X_dev_scaled = X_dev_scaled.reshape(-1, 1)
        X_test_scaled = X_test_scaled.reshape(-1, 1)

     # Data splitter for 5 stratified folds
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Initialize empty lists to store AUC scores for validation
    val_auc_scores = []

    # Initialize empty lists to store test predictions
    test_predictions = []

    # Split the data into train and val using KFold
    for train_idx, val_idx in cv.split(X_dev, y_dev):

        # Train and Val Folds
        X_train_fold, X_val_fold = X_dev_scaled[train_idx], X_dev_scaled[val_idx]
        y_train_fold, y_val_fold = y_dev[train_idx], y_dev[val_idx]

        # Create a logistic regression model
        logistic_model = LogisticRegression(C=0.01, solver="liblinear")

        # Fit the model on the training data
        logistic_model.fit(X_train_fold, y_train_fold)

        # Predict the probabilities for validation and test sets
        val_preds = logistic_model.predict_proba(X_val_fold)[:, 1]
        test_preds = logistic_model.predict_proba(X_test_scaled)[:, 1]

        # Calculate AUC for validation set
        val_auc = roc_auc_score(y_val_fold, val_preds)
        val_auc_scores.append(val_auc)

        # Add test predictions
        test_predictions.append(test_preds)

    print(f'Average AUC: {np.mean(val_auc_scores)}')
    print(f'Std AUC: {np.std(val_auc_scores)}')

    average_test_predictions = np.mean(test_predictions, axis=0)

    return average_test_predictions

### All Instances

In [10]:
all_instances_pred = Logisitc_Regression(X_dev_sorted, y_dev, X_test_sorted)

Average AUC: 0.5615293935660068
Std AUC: 0.050283000662414


### Mean Pooling

In [11]:
mean_pred = Logisitc_Regression(X_dev_mean, y_dev, X_test_mean)

Average AUC: 0.5847386023587398
Std AUC: 0.04671089831833901


### Max Pooling

In [12]:
max_pred = Logisitc_Regression(X_dev_max, y_dev, X_test_max)

Average AUC: 0.5491852160044379
Std AUC: 0.04652345850923642


### Top 5 Max Pooling

In [13]:
max_5_pred = Logisitc_Regression(X_dev_max_5, y_dev, X_test_max_5)

Average AUC: 0.5612056136083596
Std AUC: 0.030330855903588597


## Saving Predictions

In [14]:
# Load metadata about each sample
data_path = '/content/drive/My Drive/Breast_Cancer_Detection/Data/'
df_test = pd.read_csv(data_path + "test_metadata.csv")

# Join sample ID metadata with probability prediction
all_tiles_submission = pd.DataFrame( {"Sample ID": df_test["Sample ID"].values, "Target": all_instances_pred}).sort_values("Sample ID")
mean_pred_submission = pd.DataFrame( {"Sample ID": df_test["Sample ID"].values, "Target": mean_pred}).sort_values("Sample ID")
max_pred_submission = pd.DataFrame( {"Sample ID": df_test["Sample ID"].values, "Target": max_pred}).sort_values("Sample ID")
max_5_pred_submission = pd.DataFrame( {"Sample ID": df_test["Sample ID"].values, "Target": max_5_pred}).sort_values("Sample ID")

In [15]:
def sanity_checks(submission):
    assert all(submission["Target"].between(0, 1)), "`Target` values must be in [0, 1]"
    assert submission.shape == (149, 2), "Your submission file must be of shape (149, 2)"
    assert list(submission.columns) == ["Sample ID", "Target",], "Your submission file must have columns `Sample ID` and `Target`"

sanity_checks(all_tiles_submission)
sanity_checks(mean_pred_submission)
sanity_checks(max_pred_submission)
sanity_checks(max_5_pred_submission)

In [16]:
# Save the submission as a csv file

submission_path = '/content/drive/My Drive/Breast_Cancer_Detection/Predictions/'

all_tiles_submission.to_csv(submission_path + "all_tiles_submission.csv", index=None)
mean_pred_submission.to_csv(submission_path + "mean_pred_submission.csv", index=None)
max_pred_submission.to_csv(submission_path + "max_pred_submission.csv", index=None)
max_5_pred_submission.to_csv(submission_path + "max_5_pred_submission.csv", index=None)


all_tiles_submission.head()

Unnamed: 0,Sample ID,Target
0,ID_003.npy,0.355642
1,ID_004.npy,0.352566
2,ID_008.npy,0.362484
3,ID_009.npy,0.230364
4,ID_010.npy,0.393431


In [17]:
print(f'All Tiles mean: {round(np.mean(all_instances_pred),3)}, stdev: {round(np.std(all_instances_pred),3)}')
print(f'Tile Mean mean: {round(np.mean(mean_pred),3)}, stdev: {round(np.std(mean_pred),3)}')
print(f'Max Tile mean: {round(np.mean(max_pred),3)}, stdev: {round(np.std(max_pred),3)}')
print(f'Top 5 tiles mean: {round(np.mean(max_5_pred),3)}, stdev: {round(np.std(max_5_pred),3)}')

All Tiles mean: 0.362, stdev: 0.055
Tile Mean mean: 0.444, stdev: 0.018
Max Tile mean: 0.442, stdev: 0.015
Top 5 tiles mean: 0.432, stdev: 0.023
