In [None]:
%%capture
!pip install wandb --upgrade

In [None]:
import os
import re
import cv2
import glob
import imageio
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from skimage.transform import resize
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient

# Read the W&B API key from Kaggle's secret store so it is not hard-coded here.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_api")

# Authenticate this session with Weights & Biases using that key.
wandb.login(key=wandb_api)

In [None]:
# Input directory. Keep the trailing slash: file names are appended to this
# path by plain string concatenation (e.g. WORKING_DIR_PATH+'train.csv').
WORKING_DIR_PATH = '../input/hpa-single-cell-image-classification/'
# Image dimensions; presumably the target size in pixels for resized images
# -- TODO confirm against the cells that use them.
IMAGE_HEIGHT = 128
IMAGE_WIDTH = 128

In [None]:
# Ref: https://www.kaggle.com/divyanshuusingh/eda-image-segmentation
# Maps each integer class id to its display name. The id of a class is its
# position in the list below, so the order of the entries must not change.
label_names = dict(enumerate([
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Intermediate filaments",
    "Actin filaments",
    "Microtubules",
    "Mitotic spindle",
    "Centrosome",
    "Plasma membrane",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Vesicles and punctate cytosolic patterns",
    "Negative",
]))

In [None]:
# Load the training annotations.
train_csv_path = WORKING_DIR_PATH + 'train.csv'
df = pd.read_csv(train_csv_path)

# num_classes: how many '|'-separated labels each row's Label string holds.
df['num_classes'] = [len(label_str.split('|')) for label_str in df['Label']]

print(f'Total number of images: {len(df)}')
df.head()

In [None]:
# How many images carry 1, 2, 3, ... labels: print the table, then chart it.
labels_per_image = df['num_classes'].value_counts()
print(labels_per_image)
labels_per_image.plot.bar(
    title='Examples with multiple labels',
    xlabel='number of labels per example',
    ylabel='# train examples',
)
plt.show()

* First, create a single-label dataset from the images that have exactly one image-level label.

In [None]:
# Keep only the rows that carry exactly one image-level label.
# .copy() makes this an independent frame, so the dtype conversion below does
# not assign into a slice of `df` (which triggers SettingWithCopyWarning).
df_one_label = df.loc[df['num_classes'] == 1].copy()
print(f'Number of images with one image-level labels: {len(df_one_label)}')
# With a single label per row, the label string can be cast to an integer.
# Bracket assignment is used so the column is always what gets replaced.
df_one_label['Label'] = df_one_label['Label'].astype('int64')
df_one_label.head()

In [None]:
# Save as Artifacts

# Write the single-label frame to disk (index column included) for upload.
df_one_label.to_csv('train_single.csv', index=True)

run = wandb.init(entity='ayush-thakur', project='hpa', job_type='single_label_dataset')
# Declare the raw dataset artifact as an input of this run.
artifact_raw = run.use_artifact('ayush-thakur/hpa/raw:v0', type='dataset')

# Log the CSV as a new 'single_label' dataset artifact.
artifact = wandb.Artifact('single_label', type='dataset')
artifact.add_file('train_single.csv')
run.log_artifact(artifact)
# run.join() is a deprecated alias; finish() is the supported way to end a run.
run.finish()

In [None]:
# Count how many single-label images fall into each class.
labels, counts = np.unique(df_one_label.Label.values, return_counts=True)
print(f'The unique labels are: {labels} and there values are: {counts}')

plt.figure(figsize=(15,5))
# Bars are positioned at the label id itself.
plt.bar(labels, counts)

# Annotate each bar at its real x position (the label id) rather than its rank
# in `labels`, so the text stays aligned even if some class id is missing.
for label, value in zip(labels, counts):
    plt.text(label-0.25, value, str(value), fontdict=dict(fontsize=10))

# Tick only the classes that are present and look each name up by id; passing
# all of label_names.values() would fail whenever a class has no examples.
plt.xticks(labels, labels=[label_names[label] for label in labels], rotation=85)
plt.show()

* Creating this dataset would take more than 9 hours, and the Kernel runtime limit is currently 9 hrs, so I am splitting `df_one_label` (saved as `train_single.csv`) into four `csv` files. Each will have roughly 2500 image ids. 

In [None]:
# Split df_one_label into 4 nearly equal, contiguous chunks.
# The row positions are split and then selected with .iloc, instead of passing
# the DataFrame to np.array_split directly: that path goes through the
# deprecated DataFrame.swapaxes. The resulting chunks are the same.
df_splits = [
    df_one_label.iloc[positions]
    for positions in np.array_split(np.arange(len(df_one_label)), 4)
]

In [None]:
# Sanity check: total row count followed by the size of each of the 4 chunks.
tuple(map(len, (df_one_label, df_splits[0], df_splits[1], df_splits[2], df_splits[3])))

In [None]:
# Write every chunk to its own CSV, numbered by its position in df_splits.
for i in range(len(df_splits)):
    df_splits[i].to_csv(f'train_single_{i}.csv')

In [None]:
# Save as Artifacts
run = wandb.init(entity='ayush-thakur', project='hpa', job_type='single_label_dataset_split')
# Declare the single-label dataset artifact as an input of this run.
artifact_single_label = run.use_artifact('ayush-thakur/hpa/single_label:v1', type='dataset')

artifact = wandb.Artifact('single_label_split', type='dataset')

# Attach one CSV per chunk; the count follows df_splits instead of a
# hard-coded 4 so it cannot drift from the number of files written.
for i in range(len(df_splits)):
    artifact.add_file(f'train_single_{i}.csv')

run.log_artifact(artifact)
# run.join() is a deprecated alias; finish() is the supported way to end a run.
run.finish()