# Data Skewness

In [1]:
# Copy the mmdetection tree from the attached dataset to the filesystem root.
! rsync -a /kaggle/input/mmdetection-v280/mmdetection /
# Install dependencies from local source trees shipped as Kaggle datasets
# (all install targets are local paths under /kaggle/input).
! pip install /kaggle/input/mmdetection-v280/src/mmpycocotools-12.0.3/mmpycocotools-12.0.3/
! pip install /kaggle/input/hpapytorchzoo/pytorch_zoo-master/
! pip install /kaggle/input/hpacellsegmentation/HPA-Cell-Segmentation/
! pip install /kaggle/input/iterative-stratification/iterative-stratification-master/

# Copy the project's data and code folders to the filesystem root.
! cp -r /kaggle/input/kgl-humanprotein-data/kgl_humanprotein_data /
! cp -r /kaggle/input/humanpro/kgl_humanprotein /

# Put the copied code folder on the import path so the `kgl_humanprotein`
# imports in the next cell can be resolved.
import sys
sys.path.append('/kgl_humanprotein/')

Processing /kaggle/input/mmdetection-v280/src/mmpycocotools-12.0.3/mmpycocotools-12.0.3
Building wheels for collected packages: mmpycocotools
  Building wheel for mmpycocotools (setup.py) ... [?25l- \ | / - \ | / - done
[?25h  Created wheel for mmpycocotools: filename=mmpycocotools-12.0.3-cp37-cp37m-linux_x86_64.whl size=272908 sha256=44d8993e8b6d4bace55a7908b24bd635fe48df52c3cfc13d6c99731be5c13b12
  Stored in directory: /root/.cache/pip/wheels/80/e0/da/3288fdf3965b5c9090f368462db9d28be2c82013f51821090a
Successfully built mmpycocotools
Installing collected packages: mmpycocotools
Successfully installed mmpycocotools-12.0.3
Processing /kaggle/input/hpapytorchzoo/pytorch_zoo-master
Building wheels for collected packages: pytorch-zoo
  Building wheel for pytorch-zoo (setup.py) ... [?25l- \ done
[?25h  Created wheel for pytorch-zoo: filename=pytorch_zoo-0.0.0-py3-none-any.whl size=30139 sha256=dc9671a2eeb85e8e7fbb7ace2ff49f14d982dc12c14744a89dd27345d688

In [2]:
import os
import time
from pathlib import Path
import shutil
import zipfile
import functools
import multiprocessing
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import KFold,StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import torch
from torch.backends import cudnn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import DataParallel
import matplotlib.pyplot as plt
from tqdm import tqdm

from kgl_humanprotein.utils.common_util import *
from kgl_humanprotein.config.config import *
from kgl_humanprotein.data_process import *
from kgl_humanprotein.datasets.tool import image_to_tensor
from kgl_humanprotein.networks.imageclsnet import init_network
from kgl_humanprotein.layers.loss import *
from kgl_humanprotein.layers.scheduler import *
from kgl_humanprotein.utils.augment_util import train_multi_augment2
from kgl_humanprotein.utils.log_util import Logger
from kgl_humanprotein.run.train import *

run on 1ea06715a297


In [3]:
# Switch the kernel's working directory to /kaggle.
%cd /kaggle

/kaggle


In [4]:
# Root folders used by the notebook.
# dir_data : where the attached input datasets are read from.
# dir_mdata: not used in the cells shown here -- presumably a scratch area
#            for derived metadata; TODO confirm.
dir_data = Path('/kaggle', 'input')
dir_mdata = Path('/kaggle', 'mdata')

# Data

In [5]:
def load_subset5_raw():
    '''
    Read the subset-5 training table from its feather file and return it
    with a `subset` column set to 5 on every row.
    '''
    feather_path = Path('/kaggle/input/humanpro-train-cells-subset5/humanpro_train_cells_subset5/train/train.feather')
    # `assign` adds (or overwrites) the column and hands back the frame.
    return pd.read_feather(feather_path).assign(subset=5)

def load_subsets_raw():
    '''
    Stack the main training table on top of the subset-5 table and return
    the result with a fresh 0..n-1 row index.
    '''
    frames = [
        # Judging by the original variable name this file holds subsets 0-4
        # -- TODO confirm against the dataset.
        pd.read_feather('/kaggle/input/humanpro-raw-meta-channel-max/train.feather'),
        load_subset5_raw(),
    ]
    return pd.concat(frames, axis=0, ignore_index=True)

def load_pseudo_raw():
    '''
    Read and return the table stored in the
    `humanpro-data-multilabel-cells-meta` dataset's train.feather.
    '''
    feather_path = '/kaggle/input/humanpro-data-multilabel-cells-meta/train.feather'
    return pd.read_feather(feather_path)

def load_soft_pseudo_multi_labels(neg=False):
    '''
    Read the soft pseudo-label table for cells.

    With `neg=True` the `_neg` variant of the file is read, which carries a
    non-zero pseudo-label for the "Negative" class.
    '''
    suffix = '_neg' if neg else ''
    pth = f"/kaggle/input/humanpro-data-soft-pseudolabel/df_cells_softlabel{suffix}.feather"
    return pd.read_feather(pth)

def seperate_single_multi_labels(df):
    '''
    Split `df` by the number of '|'-separated labels in its `Target` column.

    Returns a `(single, multi)` pair: rows whose target holds exactly one
    label, and all remaining rows.
    '''
    def _has_one_label(target):
        # No separator means the target is a single label.
        return target.count('|') == 0

    single_mask = df.Target.apply(_has_one_label)
    single_rows = df[single_mask]
    multi_rows = df[~single_mask]
    return single_rows, multi_rows

def replace_multi_with_pseudo(df_orig_multi, df_pseudo):
    '''
    Swap each row's `Target` for the one found in `df_pseudo` under the same `Id`.

    The two frames are inner-joined on `Id`, so rows without a match in
    `df_pseudo` are dropped. The previous target is kept in a column named
    `original_target`.
    '''
    pseudo_targets = df_pseudo[['Id', 'Target']]
    merged = df_orig_multi.merge(pseudo_targets, on='Id', how='inner')
    # Both sides carry a `Target` column, so the merge suffixes them _x / _y.
    return merged.rename(columns={'Target_x': 'original_target', 'Target_y': 'Target'})

def sort_target_labels(target):
    '''
    Return `target` with its '|'-separated integer labels de-duplicated and
    arranged in ascending numeric order, e.g. '10|2|2' -> '2|10'.
    '''
    unique_ids = {int(token) for token in target.split('|')}
    return '|'.join(map(str, sorted(unique_ids)))


def get_label_nsample_summary(df, col_targ='Target', standard_scale=False, n_labels=19):
    '''
    For each class label and target length, return the number
    of samples.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. `df[col_targ]` holds '|'-separated label ids
        as a string, e.g. '0|16'.
    col_targ : str
        Name of the target column.
    standard_scale : bool
        If True, every row is divided by that row's range (max - min).
        NOTE(review): the minimum is not subtracted first, so values are
        not confined to [0, 1]; a row whose counts are all equal has a
        zero range and yields inf/NaN.
    n_labels : int
        Labels '0' .. str(n_labels - 1) form the columns (default 19).
        Labels outside that set are ignored.

    Returns
    -------
    pandas.DataFrame
        Index `target_length` (number of '|'-separated entries in the
        target, ascending), columns `label` ('0' .. str(n_labels - 1)).
        Each cell is the number of rows of that length whose target
        contains the label. Counts are int64 unless `standard_scale` is set.
    '''
    labels = [str(label) for label in range(n_labels)]

    label_lists = df[col_targ].apply(lambda targ: targ.split('|')).reset_index(drop=True)
    cells = pd.DataFrame({
        'target_length': label_lists.apply(len),
        # A label repeated inside one target still counts that sample once.
        'label': label_lists.apply(lambda xs: list(dict.fromkeys(xs))),
    })
    # One row per (sample, distinct label): a single pass over the data
    # instead of one scan of the whole column per label.
    cells = cells.explode('label')
    cells = cells[cells['label'].isin(labels)]

    summary = (
        cells.groupby(['target_length', 'label'])
        # `size` counts rows; `count` on an arbitrary first column would
        # silently skip rows where that column is NaN.
        .size()
        .unstack('label', fill_value=0)
        # Labels never seen still get an all-zero column, in numeric order.
        .reindex(columns=labels, fill_value=0)
        .rename_axis(index='target_length', columns='label')
        .sort_index()
        .astype('int64')
    )

    if standard_scale:
        row_range = summary.max(axis=1) - summary.min(axis=1)
        summary = summary.div(row_range, axis=0)

    return summary

In [6]:
%%time

# Load the combined training table (main file stacked with subset 5).
df_orig = load_subsets_raw()
# Rewrite every target so its labels are unique and in ascending numeric
# order; equal label sets then compare equal as strings.
df_orig['Target'] = df_orig.Target.apply(sort_target_labels)

# Show (n_rows, n_columns).
df_orig.shape

CPU times: user 5.28 s, sys: 6.39 s, total: 11.7 s
Wall time: 18.7 s


(526066, 9)

# Identify under-represented classes at each target length

In [7]:
def get_label_nsample_summary(df, col_targ='Target', standard_scale=False, n_labels=19):
    '''
    For each class label and target length, return the number
    of samples.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. `df[col_targ]` holds '|'-separated label ids
        as a string, e.g. '0|16'.
    col_targ : str
        Name of the target column.
    standard_scale : bool
        If True, every row is divided by that row's range (max - min).
        NOTE(review): the minimum is not subtracted first, so values are
        not confined to [0, 1]; a row whose counts are all equal has a
        zero range and yields inf/NaN.
    n_labels : int
        Labels '0' .. str(n_labels - 1) form the columns (default 19).
        Labels outside that set are ignored.

    Returns
    -------
    pandas.DataFrame
        Index `target_length` (number of '|'-separated entries in the
        target, ascending), columns `label` ('0' .. str(n_labels - 1)).
        Each cell is the number of rows of that length whose target
        contains the label. Counts are int64 unless `standard_scale` is set.
    '''
    labels = [str(label) for label in range(n_labels)]

    label_lists = df[col_targ].apply(lambda targ: targ.split('|')).reset_index(drop=True)
    cells = pd.DataFrame({
        'target_length': label_lists.apply(len),
        # A label repeated inside one target still counts that sample once.
        'label': label_lists.apply(lambda xs: list(dict.fromkeys(xs))),
    })
    # One row per (sample, distinct label): a single pass over the data
    # instead of one scan of the whole column per label.
    cells = cells.explode('label')
    cells = cells[cells['label'].isin(labels)]

    summary = (
        cells.groupby(['target_length', 'label'])
        # `size` counts rows; `count` on an arbitrary first column would
        # silently skip rows where that column is NaN.
        .size()
        .unstack('label', fill_value=0)
        # Labels never seen still get an all-zero column, in numeric order.
        .reindex(columns=labels, fill_value=0)
        .rename_axis(index='target_length', columns='label')
        .sort_index()
        .astype('int64')
    )

    if standard_scale:
        row_range = summary.max(axis=1) - summary.min(axis=1)
        summary = summary.div(row_range, axis=0)

    return summary

In [8]:
# Sample counts per (target length, class label) for the full training table.
label_summary = get_label_nsample_summary(df_orig, 'Target')

In [9]:
# Display the table: rows are target lengths, columns are class labels.
label_summary

label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
target_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,37472,4845.0,12672,12882,17527,15337,10198.0,18825,11194.0,5400,7789.0,172,13952,22395,27494,2275.0,22738,6130,952.0
2,120356,11553.0,27083,10412,10598,18921,4479.0,18181,6109.0,7679,6106.0,3188,20693,30686,15909,3187.0,68767,4255,0.0
3,52750,6671.0,15999,5006,4311,8577,4170.0,10433,2405.0,4095,3878.0,3083,8010,19875,5011,1639.0,41741,22507,0.0
4,7135,1993.0,3205,873,274,1078,345.0,1674,620.0,1115,223.0,94,1438,3717,486,122.0,6639,1621,0.0
5,186,0.0,34,32,63,32,0.0,34,0.0,57,0.0,63,34,120,32,0.0,186,57,0.0


1. At each target length, some labels have far fewer samples than others.
2. To boost their numbers, we can look for additional images that contain the under-represented labels at that target length.
3. However, the added images may also increase the counts of labels that are already abundant, so there is no guarantee that this will reduce the skewness at that target length.

In [10]:
# For every target length, list the labels with fewer than 1,000 samples.
# The entries are column positions; they coincide with the label ids because
# the summary's columns are the labels '0', '1', ... in ascending order.
underreps = {
    target_length: list(np.flatnonzero(counts.to_numpy() < 1_000))
    for target_length, counts in label_summary.iterrows()
}
underreps

{1: [11, 18],
 2: [18],
 3: [18],
 4: [3, 4, 6, 8, 10, 11, 14, 15, 18],
 5: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]}