In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras import backend as K

# Root folder for the project's data files. Later cells build file names by plain
# string concatenation (path + 'label_data/...'), so the trailing '/' is required.
# NOTE(review): this is relative to the working directory and presumably expects
# Google Drive to be mounted (see the commented-out mount call above) - confirm.
path = './drive/MyDrive/cs109b_final_project/'

# Helper Functions

### Checking for Data Leakage

In [None]:
# Checks for leakage between two dataframes
def has_leakage(df_1, df_2, patient_id):
  """
  Report whether any patient appears in both dataframes.

  Args:
    df_1: first dataframe, must contain the patient_id column
    df_2: second dataframe, must contain the patient_id column
    patient_id: name of the column holding the patient identifiers

  Returns:
    bool: True if at least one patient id occurs in both df_1 and df_2,
      False otherwise (including when either column is empty).
  """
  # Collapse each id column to its set of distinct values
  ids_1 = set(df_1[patient_id].values)
  ids_2 = set(df_2[patient_id].values)

  # Leakage exists exactly when the two id sets share an element
  return not ids_1.isdisjoint(ids_2)

### Loss Functions

In [None]:
# Get class frequencies - used to get pos_weights, neg_weights for Weighted loss function
def get_class_frequencies(y_cols):
  """
  Returns positive and negative frequencies for each class.

  Args:
    y_cols (np.array): array of 0/1 labels, size (num_observations, num_classes) i.e train_generator.labels

  Returns:
    pos_freqs (np.array): array of positive frequencies for each class, size (num_classes)
    neg_freqs (np.array): array of negative frequencies for each class, size (num_classes)

  """
  # With 0/1 labels, the column mean is the fraction of positive observations per class
  pos_freqs = np.mean(y_cols, axis=0)
  # Every observation is either positive or negative, so the two fractions sum to 1
  neg_freqs = 1 - pos_freqs

  # Fixed: the original returned the undefined name `neg_freqs2`, raising NameError
  return pos_freqs, neg_freqs

In [None]:
# Example: run get_class_frequencies on the labels in train_relabeled.csv
df = pd.read_csv(path + 'label_data/train_relabeled.csv')
# Every column except 'Path' is used as a label column
labels = df.drop(columns='Path').values

# Per-class fractions of positive / negative labels (reused by the weighted-loss demo cell)
pos_freqs, neg_freqs = get_class_frequencies(labels)
print(f"Positive Frequencies: {pos_freqs}")
print(f"Negative Frequencies: {neg_freqs}")

Positive Frequencies: [0.08045662 0.0537932  0.19299046 0.05120796 0.23294504 0.10553849
 0.11787666 0.17728641 0.10560946 0.11358821 0.09973945 0.11114491
 0.11526101 0.15459716]
Negative Frequencies: [0.91954338 0.9462068  0.80700954 0.94879204 0.76705496 0.89446151
 0.88212334 0.82271359 0.89439054 0.88641179 0.90026055 0.88885509
 0.88473899 0.84540284]


In [None]:
# Weighted loss function: pos_weights = negative frequencies, neg_weights = positive frequencies (from lecture slides)

def get_weighted_loss(pos_weights, neg_weights, epsilon=1e-7):
  """
  Build a class-weighted binary cross-entropy loss function.

  Args:
    pos_weights (np.array): weight of the positive-label term for each class, size (num_classes)
    neg_weights (np.array): weight of the negative-label term for each class, size (num_classes)
    epsilon (float): small constant added to the argument of every log, keeping it
      away from zero when a prediction is exactly 0 or 1

  Returns:
    weighted_loss (function): closure taking (y_true, y_pred) and returning the loss.
  """
  def weighted_loss(y_true, y_pred):
    """
    Compute the weighted loss for one batch.

    Args:
      y_true (Tensor): true labels, size (num_examples, num_classes)
      y_pred (Tensor): predicted probabilities, size (num_examples, num_classes)

    Returns:
      loss: scalar loss, the sum over classes of each class's mean weighted
        cross-entropy across the examples.
    """
    loss = 0.0

    # The number of classes is taken from pos_weights; classes are handled one column at a time
    for i in range(len(pos_weights)):
      truth = y_true[:, i]
      prob = y_pred[:, i]

      # Contribution of the positive labels and of the negative labels for class i
      pos_term = pos_weights[i] * truth * K.log(prob + epsilon)
      neg_term = neg_weights[i] * (1 - truth) * K.log(1 - prob + epsilon)

      # Average over the examples, negate, and accumulate into the running total
      loss += -1. * K.mean(pos_term + neg_term)

    return loss

  return weighted_loss

In [None]:
# Use case of get_weighted_loss - both losses should be the same if weighted correctly.
# Why: with pos_weights = neg_freqs and neg_weights = pos_freqs, a constant prediction p
# gives each class a loss of -pos_freq * neg_freq * (log(p) + log(1 - p)) (ignoring epsilon),
# which is unchanged when p is swapped with 1 - p, so p = 0.7 and p = 0.3 must agree.
# NOTE(review): K.get_session() and .eval() look like the TF1-style session API and may
# not be available in newer Keras/TensorFlow releases - confirm against the installed version.
session = K.get_session()
# NOTE(review): the `as session` clause rebinds the name `session` to whatever as_default() yields.
with session.as_default() as session:
  # Swap the frequencies so the rarer label of each class gets the larger weight
  pos_weights = neg_freqs
  neg_weights = pos_freqs
  # Two constant prediction arrays with the same shape as the labels
  preds_1 = 0.7 * np.ones(labels.shape)
  preds_2 = 0.3 * np.ones(labels.shape)
  loss_func = get_weighted_loss(pos_weights, neg_weights)
  # .eval() evaluates the returned loss tensor so it can be formatted as a number
  print(f'preds_1 loss: {loss_func(labels, preds_1).eval():.4f}')
  print(f'preds_2 loss: {loss_func(labels, preds_2).eval():.4f}')

preds_1 loss: 2.2919
preds_2 loss: 2.2919
