# RANZCR CLiP EDA - Class Imbalance, Patient Overlap


### Competition : **RANZCR CLiP - Catheter and Line Position Challenge**


Outline : 

* Class Count Plot 
* Patient Overlap 
* Class Imbalance 
* Weighted Loss

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import os
import torch 
from plotly.offline import iplot

pd.options.plotting.backend = "plotly"

# Class Count Plot

In [None]:
# Load the competition's training CSV into a DataFrame and preview its first rows.
data = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train.csv')
data.head()

In [None]:
# Sum the columns at positions 1-11 and draw the totals as a horizontal bar chart.
# NOTE(review): presumably these 11 columns are the per-class label columns - confirm against train.csv.
data.iloc[:,1:12].sum().plot.barh()

# Patient Overlap 

Patient overlap occurs when the train dataset and the valid dataset contain examples from the same patient, i.e. when the intersection of the train patient IDs and the valid patient IDs is not empty.

To avoid patient overlap:
* Either drop the overlapping data from the train dataset or from the valid dataset
* Or move the overlapping data entirely into one split (from the train dataset to the valid dataset, or from the valid dataset to the train dataset)

In [None]:
# Report how many distinct values the PatientID column holds.
print("No. of unique patient : {}".format(len(set(data['PatientID']))))

In [None]:
def get_split(data, frac=0.985, random_state=None):
    """
    Randomly split a dataframe into a train part and a validation part.

    The split is done row by row, so rows that share a patient ID can land
    on both sides of the split.

    Args:
        data (dataframe): dataframe to split; its index labels are used to
            tell the two parts apart
        frac (float): fraction of rows sampled into the train part
        random_state (int or None): seed forwarded to ``DataFrame.sample``;
            pass an integer to make the split reproducible

    Returns:
        train_df (dataframe): the sampled rows
        valid_df (dataframe): every row of ``data`` whose index label was not sampled
    """
    train_df = data.sample(frac=frac, random_state=random_state)
    # Whatever was not sampled for training becomes the validation set.
    valid_df = data.drop(train_df.index)
    return train_df, valid_df

In [None]:
# Split the full table into train and valid parts with get_split, then report the row count of each.
train_data,valid_data = get_split(data)
print("Size of train set : {}".format(len(train_data)))
print("Size of valid set : {}".format(len(valid_data)))

In [None]:
def check_for_leakage(df1, df2, patient_col):
    """
    Tell whether two dataframes share at least one patient ID.

    Args:
        df1 (dataframe): dataframe describing the first dataset
        df2 (dataframe): dataframe describing the second dataset
        patient_col (str): name of the column holding the patient IDs

    Returns:
        leakage (bool): True if some patient ID occurs in both dataframes,
            otherwise False
    """
    first_ids = set(df1[patient_col].values)
    second_ids = set(df2[patient_col].values)

    # The sets are disjoint exactly when no patient is shared.
    return not first_ids.isdisjoint(second_ids)

In [None]:
# Check whether any PatientID value appears in both the train and the valid split.
print("Are patient overlapping ? : {}".format(check_for_leakage(train_data,valid_data,'PatientID')))

In [None]:
# Patient IDs of every row in each split (repeated values are kept here).
ids_train = train_data.PatientID.values
ids_valid = valid_data.PatientID.values

# Collapse to the distinct patient IDs of each split.
ids_train_set = set(ids_train)
ids_valid_set = set(ids_valid)

print("Unique patient Id in train set : {}".format(len(ids_train_set)))
print("Unique patient Id in valid set : {}".format(len(ids_valid_set)))

In [None]:
# Patient IDs present in both splits, as a list so they can be accessed by position.
patient_overlap = list(ids_train_set.intersection(ids_valid_set))

n_overlap = len(patient_overlap)
print("No. of Patient IDs in both train and valid set : {}".format(n_overlap))

In [None]:
# Index labels of the rows, in each split, whose patient also appears in the
# other split. One vectorised isin() pass per frame replaces a full-column
# scan for every overlapping patient; the labels come out in frame order
# instead of being grouped by patient.
train_overlap_idxs = train_data.index[train_data['PatientID'].isin(patient_overlap)].tolist()
valid_overlap_idxs = valid_data.index[valid_data['PatientID'].isin(patient_overlap)].tolist()

In [None]:
# Copy every train row that belongs to an overlapping patient into the valid set.
add_to_valid = train_data.loc[train_overlap_idxs]
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the original
# index labels, exactly as append did by default.
valid_data = pd.concat([valid_data, add_to_valid])

In [None]:
# Remove the overlapping-patient rows from the train split by index label.
train_data = train_data.drop(train_overlap_idxs)

In [None]:
# Report the row count of each split.
print("Size of train set : {}".format(len(train_data)))
print("Size of valid set : {}".format(len(valid_data)))

In [None]:
# Per-column totals (columns at positions 1-11) of the valid split, as a horizontal bar chart.
valid_data.iloc[:,1:12].sum().plot.barh()

In [None]:
# Per-column totals (columns at positions 1-11) of the train split, as a horizontal bar chart.
train_data.iloc[:,1:12].sum().plot.barh()

# Class Imbalance 

In [None]:
def compute_class_freqs(labels):
    """
    Compute the positive and negative frequency of every class.

    Args:
        labels: array-like converted with ``np.array``; the first axis is
            treated as the examples axis and summed over

    Returns:
        positive_frequencies: sum of the labels over the first axis divided
            by the number of examples
        negative_frequencies: ``1 - positive_frequencies``
    """
    label_matrix = np.array(labels)
    n_examples = label_matrix.shape[0]

    positive_frequencies = label_matrix.sum(axis=0) / n_examples

    return positive_frequencies, 1 - positive_frequencies

In [None]:
# Positive / negative frequency of each of the columns at positions 1-11 in the train split.
freq_pos, freq_neg = compute_class_freqs(train_data.iloc[:,1:12])

In [None]:
# Names of the same columns (positions 1-11), used to label the charts.
class_names = train_data.iloc[:,1:12].columns

In [None]:
# Long-format table with one row per (class, label) pair: the positive rows
# come first, followed by the negative rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat(
    [
        pd.DataFrame({"Class": class_names, "Label": "Positive", "Value": freq_pos}),
        pd.DataFrame({"Class": class_names, "Label": "Negative", "Value": freq_neg}),
    ],
    ignore_index=True,
)

# Select rows by label instead of by a hard-coded row position, so the chart
# stays correct if the number of classes changes.
positive_rows = df[df["Label"] == "Positive"]
negative_rows = df[df["Label"] == "Negative"]

trace1 = go.Bar(
    x=positive_rows.Class,
    y=positive_rows.Value,
    name="positive",
    marker=dict(color='rgba(0,0,255, 0.7)',
                line=dict(color='rgb(0,0,0)', width=1.5)),
    text=positive_rows.Label,
)

trace2 = go.Bar(
    x=negative_rows.Class,
    y=negative_rows.Value,
    name="negative",
    marker=dict(color='rgba(255,0,0, 0.7)',
                line=dict(color='rgb(0,0,0)', width=1.5)),
    text=negative_rows.Label,
)

# Kept in `traces` rather than `data`, so the DataFrame loaded from train.csv
# is not overwritten and earlier cells can still be re-run.
traces = [trace1, trace2]
layout = go.Layout(barmode="group")
fig = go.Figure(data=traces, layout=layout)
iplot(fig)

The contribution of the positive cases is significantly lower than that of the negative ones. However, we want the contributions to be equal. One way of doing this is by multiplying each example from each class by a class-specific weight factor, $pos_{weights}$ and $neg_{weights}$, so that the overall contribution of each class is the same.

To have this, we want

$$pos_{weights} \times freq_{pos} = neg_{weights} \times freq_{neg},$$
which we can do simply by taking

$$pos_{weights} = freq_{neg}$$

$$neg_{weights} = freq_{pos}$$

This way, we will be balancing the contribution of positive and negative labels.

In [None]:
# Weight each label by the frequency of the opposite label, so that
# freq_pos * pos_weights equals freq_neg * neg_weights for every class.
pos_weights = freq_neg
neg_weights = freq_pos
# Weighted contribution of each label; both are freq_pos * freq_neg.
pos_contribution = freq_pos * pos_weights 
neg_contribution = freq_neg * neg_weights

In [None]:
# Long-format table with one row per (class, label) pair holding the weighted
# contribution: the positive rows come first, followed by the negative rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat(
    [
        pd.DataFrame({"Class": class_names, "Label": "Positive", "Value": pos_contribution}),
        pd.DataFrame({"Class": class_names, "Label": "Negative", "Value": neg_contribution}),
    ],
    ignore_index=True,
)

# Select rows by label instead of by a hard-coded row position, so the chart
# stays correct if the number of classes changes.
positive_rows = df[df["Label"] == "Positive"]
negative_rows = df[df["Label"] == "Negative"]

trace1 = go.Bar(
    x=positive_rows.Class,
    y=positive_rows.Value,
    name="positive",
    marker=dict(color='rgba(0,0,255, 0.7)',
                line=dict(color='rgb(0,0,0)', width=1.5)),
    text=positive_rows.Label,
)

trace2 = go.Bar(
    x=negative_rows.Class,
    y=negative_rows.Value,
    name="negative",
    marker=dict(color='rgba(255,0,0, 0.7)',
                line=dict(color='rgb(0,0,0)', width=1.5)),
    text=negative_rows.Label,
)

# Kept in `traces` rather than `data`, so the DataFrame loaded from train.csv
# is not overwritten and earlier cells can still be re-run.
traces = [trace1, trace2]
layout = go.Layout(barmode="group")
fig = go.Figure(data=traces, layout=layout)
iplot(fig)

# Weighted Loss

To counter class imbalance, it is good practice to use a weighted loss.

$$\mathcal{L}_{cross-entropy}(x_i) = -(y_i \log(f(x_i)) + (1-y_i) \log(1-f(x_i))),$$

$$\mathcal{L}_{cross-entropy}^{w}(x) = - (pos_{weights} y \log(f(x)) + neg_{weights}(1-y) \log( 1 - f(x) ) ).$$

In [None]:
def weighted_loss(pos_weights,neg_weights,y_pred,y_true,epsilon = 1e-7):
    """
    Return the weighted binary cross-entropy summed over all classes.

    For each class i, the positive term ``y_true * log(y_pred + epsilon)``
    is scaled by ``pos_weights[i]`` and the negative term
    ``(1 - y_true) * log(1 - y_pred + epsilon)`` by ``neg_weights[i]``.
    Each term is averaged over the first axis, negated, and accumulated.

    Args:
        pos_weights: per-class weight of the positive term; its length sets
            the number of classes (columns) that are evaluated
        neg_weights: per-class weight of the negative term
        y_pred : model output, indexed as ``y_pred[:, i]`` for class i
        y_true : ground truth, indexed as ``y_true[:, i]`` for class i
        epsilon : small constant added inside each log to avoid ``log(0)``

    Returns:
        loss : weighted cross-entropy loss (the float ``0.0`` when
            ``pos_weights`` is empty)
    """
    loss = 0.0
    for i, pos_w in enumerate(pos_weights):
        neg_w = neg_weights[i]
        truth = y_true[:,i]
        prob = y_pred[:,i]

        pos_term = pos_w * truth * torch.log(prob + epsilon)
        neg_term = neg_w * (1-truth) * torch.log((1-prob) + epsilon)

        loss += -1 * torch.mean(pos_term) + -1 * torch.mean(neg_term)

    return loss