In [None]:
%%capture
!pip install wandb --upgrade

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

import wandb
from kaggle_secrets import UserSecretsClient

# Read the secret named "wandb_api" through Kaggle's UserSecretsClient so the
# API key is never written into the notebook itself.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_api")

# Authenticate this session against Weights & Biases with that key.
wandb.login(key=wandb_api)

# Raw HPA Dataset

In [None]:
# Download the RAW dataset csv artifact.
# The context manager finishes the run even when use_artifact/download raises,
# so a failed cell does not leave an open W&B run behind.
with wandb.init(project='hpa', job_type='consume_raw') as run:
    artifact = run.use_artifact('ayush-thakur/hpa/raw:v0', type='dataset')
    artifact_dir = artifact.download()

In [None]:
# Load train.csv from the directory of the artifact downloaded above and preview it.
raw_df = pd.read_csv(f'{artifact_dir}/train.csv')
raw_df.head()

# Raw Single Label Cell Level Dataset

In [None]:
# Download the single-label cell-level dataset csv artifact.
# The context manager finishes the run even when use_artifact/download raises.
with wandb.init(project='hpa', job_type='consume_single_label_dataset') as run:
    artifact = run.use_artifact('ayush-thakur/hpa/single_label_cell_level:v0', type='dataset')
    artifact_dir = artifact.download()

In [None]:
!ls ./artifacts/single_label_cell_level:v0

In [None]:
# Load the single-label cell-level table from the downloaded artifact and preview it.
raw_single_label_df = pd.read_csv(f'{artifact_dir}/single_label_cell_level.csv')
raw_single_label_df.head()

In [None]:
# Either the protein or the rgb directory can be used here.
single_label_cell_level_path = '../input/hpa-single-label-cell-level-dataset/single-label-cell-level/rgb'

# Print the number of entries in the image directory, then the number of rows in the table.
print(
    len(os.listdir(single_label_cell_level_path)),
    len(raw_single_label_df),
    sep='\n',
)

In [None]:
# List the entries of the image directory and show the first one as a sample.
file_names = os.listdir(single_label_cell_level_path)
file_names[0]

In [None]:
# Ref: https://www.kaggle.com/divyanshuusingh/eda-image-segmentation
# Mapping from integer label id to its human-readable name.
label_names= {
0: "Nucleoplasm",
1: "Nuclear membrane",
2: "Nucleoli",
3: "Nucleoli fibrillar center",
4: "Nuclear speckles",
5: "Nuclear bodies",
6: "Endoplasmic reticulum",
7: "Golgi apparatus",
8: "Intermediate filaments",
9: "Actin filaments",
10: "Microtubules",
11: "Mitotic spindle",
12: "Centrosome",
13: "Plasma membrane",
14: "Mitochondria",
15: "Aggresome",
16: "Cytosol",
17: "Vesicles and punctate cytosolic patterns",
18: "Negative"
}

# Count how many rows carry each label.
labels, counts = np.unique(raw_single_label_df.Label.values, return_counts=True)
print(f'The unique labels are: {labels} and there values are: {counts}')

# One bar per label that is actually present. Bars, count annotations and tick
# names all share the same x positions, so the plot stays correctly labelled
# (and the tick/label counts match) even when some class is missing.
positions = np.arange(len(labels))

plt.figure(figsize=(15,5))
plt.bar(positions, counts)

for position, count in zip(positions, counts):
    plt.text(position-0.25, count, str(count), fontdict=dict(fontsize=10))

plt.xticks(positions, labels=[label_names[int(label)] for label in labels], rotation=85)
plt.show()

# Download Extra Public Data

### Raw CSV

In [None]:
# Download the public HPA csv artifact.
# The context manager finishes the run even when use_artifact/download raises.
with wandb.init(project='hpa', job_type='consume_public_hpa_dataset') as run:
    artifact = run.use_artifact('ayush-thakur/hpa/hpa_public_data:v1', type='dataset')
    artifact_dir = artifact.download()

In [None]:
# Load the public HPA table from the downloaded artifact and preview it.
raw_public_df = pd.read_csv(f'{artifact_dir}/public_hpa.csv')
raw_public_df.head()

### Negative Class

In [None]:
# Download the public "negative" class artifact.
# The context manager finishes the run even when use_artifact/download raises.
with wandb.init(project='hpa', job_type='consume_public_hpa_dataset') as run:
    artifact = run.use_artifact('ayush-thakur/hpa/negative:v0', type='dataset')
    artifact_dir = artifact.download()

In [None]:
# Load the negative-class table and reduce each Image entry to the text after its last '/'.
negative_df = pd.read_csv(f'{artifact_dir}/pubic_negative.csv')
negative_df['Image'] = negative_df['Image'].apply(lambda image_path: image_path.split('/')[-1])
negative_df.head()

In [None]:
file_names = os.listdir('../input/singlelabelpublicnegative/protein/')

# Build the image-id -> label lookup once instead of scanning the whole table
# for every file. keep='first' mirrors the original `.values[0]` choice.
label_by_image = negative_df.drop_duplicates('Image', keep='first').set_index('Image')['Label_idx']

# Collect rows in a plain list and create the frame once; growing a DataFrame
# with `.loc[i] = ...` copies it on every insertion.
rows = []
for filename in tqdm(file_names):
    # The image id is the file stem without its trailing '_<suffix>' part.
    img_id = '_'.join(filename.split('.')[0].split('_')[:-1])
    rows.append([filename, int(label_by_image.loc[img_id])])

# Same column names as raw_df; rows are indexed 0..n-1 in listing order.
negative_label_df = pd.DataFrame(rows, columns=raw_df.columns)

In [None]:
# Write the cleaned table first and end the cell on head(): only the last
# expression of a cell is rendered, so a head() call placed earlier is discarded.
negative_label_df.to_csv('clean_public_negative.csv')
negative_label_df.head()

In [None]:
# Log the cleaned negative-class csv as a new version of the 'negative' artifact.
# The context manager finishes the run (also on error) and replaces run.join(),
# a deprecated alias of run.finish().
with wandb.init(project='hpa', job_type='public_negative') as run:
    # Declare the source artifact as an input of this run.
    artifact_ = run.use_artifact('ayush-thakur/hpa/negative:v0', type='dataset')
    artifact = wandb.Artifact('negative', type='dataset')
    artifact.add_file('clean_public_negative.csv')
    run.log_artifact(artifact)

### Aggresome

In [None]:
# Download the public "aggresome" class artifact.
# The context manager finishes the run even when use_artifact/download raises.
with wandb.init(project='hpa', job_type='consume_public_hpa_dataset') as run:
    artifact = run.use_artifact('ayush-thakur/hpa/aggresome:v0', type='dataset')
    artifact_dir = artifact.download()

In [None]:
# Load the aggresome table and reduce each Image entry to the text after its last '/'.
aggresome_df = pd.read_csv(f'{artifact_dir}/pubic_aggresome.csv')
aggresome_df['Image'] = aggresome_df['Image'].apply(lambda image_path: image_path.split('/')[-1])
aggresome_df.head()

In [None]:
file_names = os.listdir('../input/singlelabelpublicaggresome/rgb')

# Build the image-id -> label lookup once instead of scanning the whole table
# for every file. keep='first' mirrors the original `.values[0]` choice.
label_by_image = aggresome_df.drop_duplicates('Image', keep='first').set_index('Image')['Label_idx']

# Collect rows in a plain list and create the frame once; growing a DataFrame
# with `.loc[i] = ...` copies it on every insertion.
rows = []
for filename in tqdm(file_names):
    # The image id is the file stem without its trailing '_<suffix>' part.
    img_id = '_'.join(filename.split('.')[0].split('_')[:-1])
    rows.append([filename, int(label_by_image.loc[img_id])])

# Same column names as raw_df; rows are indexed 0..n-1 in listing order.
tmp_df = pd.DataFrame(rows, columns=raw_df.columns)

In [None]:
# Write the cleaned table first and end the cell on head(): only the last
# expression of a cell is rendered, so a head() call placed earlier is discarded.
tmp_df.to_csv('clean_public_aggresome.csv')
tmp_df.head()

In [None]:
# Log the cleaned aggresome csv as a new version of the 'aggresome' artifact.
# The context manager finishes the run (also on error) and replaces run.join(),
# a deprecated alias of run.finish().
with wandb.init(project='hpa', job_type='public_aggresome') as run:
    # Declare the source artifact as an input of this run; the returned handle
    # is not needed because `artifact` is rebound to the new artifact below.
    run.use_artifact('ayush-thakur/hpa/aggresome:v0', type='dataset')
    artifact = wandb.Artifact('aggresome', type='dataset')
    artifact.add_file('clean_public_aggresome.csv')
    run.log_artifact(artifact)

### Nuclear Membrane

In [None]:
# Download the public "nuclear membrane" class artifact.
# The context manager finishes the run even when use_artifact/download raises.
with wandb.init(project='hpa', job_type='consume_public_hpa_dataset') as run:
    artifact = run.use_artifact('ayush-thakur/hpa/nuclear_membrane:v0', type='dataset')
    artifact_dir = artifact.download()

In [None]:
# List the files inside the directory returned by artifact.download().
!ls {artifact_dir}

In [None]:
# Load the nuclear-membrane table and reduce each Image entry to the text after its last '/'.
public_df = pd.read_csv(f'{artifact_dir}/pubic_nuclear_membrane.csv')
public_df['Image'] = public_df['Image'].apply(lambda image_path: image_path.split('/')[-1])
public_df.head()

In [None]:
file_names = os.listdir('../input/singlelabelpublicnuclearmembrane/rgb')

# Build the image-id -> label lookup once instead of scanning the whole table
# for every file. keep='first' mirrors the original `.values[0]` choice.
label_by_image = public_df.drop_duplicates('Image', keep='first').set_index('Image')['Label_idx']

# Collect rows in a plain list and create the frame once; growing a DataFrame
# with `.loc[i] = ...` copies it on every insertion.
rows = []
for filename in tqdm(file_names):
    # The image id is the file stem without its trailing '_<suffix>' part.
    img_id = '_'.join(filename.split('.')[0].split('_')[:-1])
    rows.append([filename, int(label_by_image.loc[img_id])])

# Same column names as raw_df; rows are indexed 0..n-1 in listing order.
tmp_df = pd.DataFrame(rows, columns=raw_df.columns)

In [None]:
# Write the cleaned nuclear-membrane table to disk, then preview it.
# index=False is not passed, so the row index is written as an extra first column.
tmp_df.to_csv('clean_pubic_nuclear_membrane.csv')
tmp_df.head()

In [None]:
# Log the cleaned nuclear-membrane csv as a new version of the 'nuclear_membrane' artifact.
# The context manager finishes the run (also on error) and replaces run.join(),
# a deprecated alias of run.finish().
with wandb.init(project='hpa', job_type='public_nuclear_membrane') as run:
    # Declare the source artifact as an input of this run; the returned handle
    # is not needed because `artifact` is rebound to the new artifact below.
    run.use_artifact('ayush-thakur/hpa/nuclear_membrane:v0', type='dataset')
    artifact = wandb.Artifact('nuclear_membrane', type='dataset')
    artifact.add_file('clean_pubic_nuclear_membrane.csv')
    run.log_artifact(artifact)

### Mitotic Spindle

In [None]:
# Download the public "mitotic spindle" class artifact.
# The context manager finishes the run even when use_artifact/download raises.
with wandb.init(project='hpa', job_type='consume_public_hpa_dataset') as run:
    artifact = run.use_artifact('ayush-thakur/hpa/mitotic_spindle:v0', type='dataset')
    artifact_dir = artifact.download()

In [None]:
# List the files inside the directory returned by artifact.download().
!ls {artifact_dir}

In [None]:
# Load the mitotic-spindle table and reduce each Image entry to the text after its last '/'.
public_df = pd.read_csv(f'{artifact_dir}/pubic_mitotic_spindle.csv')
public_df['Image'] = public_df['Image'].apply(lambda image_path: image_path.split('/')[-1])
public_df.head()

In [None]:
file_names = os.listdir('../input/singlelabelpublicmitoticspindle/protein')

# Build the image-id -> label lookup once instead of scanning the whole table
# for every file. keep='first' mirrors the original `.values[0]` choice.
label_by_image = public_df.drop_duplicates('Image', keep='first').set_index('Image')['Label_idx']

# Collect rows (and their listing positions) and create the frame once; growing
# a DataFrame with `.loc[i] = ...` copies it on every insertion.
row_index, rows = [], []
for i, filename in enumerate(tqdm(file_names)):
    # The image id is the file stem without its trailing '_<suffix>' part.
    img_id = '_'.join(filename.split('.')[0].split('_')[:-1])
    try:
        label = int(label_by_image.loc[img_id])
    except (KeyError, ValueError, TypeError):
        # Best effort: report files without a usable label and skip them.
        # Only lookup/conversion failures are caught, so KeyboardInterrupt and
        # unrelated bugs are no longer silently swallowed by a bare `except:`.
        print(filename)
        continue
    row_index.append(i)
    rows.append([filename, label])

# Keep the listing position as index, so skipped files leave gaps as before.
tmp_df = pd.DataFrame(rows, index=row_index, columns=raw_df.columns)

In [None]:
# Write the cleaned mitotic-spindle table to disk, then preview it.
# index=False is not passed, so the row index is written as an extra first column.
tmp_df.to_csv('clean_pubic_mitotic_spindle.csv')
tmp_df.head()

In [None]:
# Log the cleaned mitotic-spindle csv as a new version of the 'mitotic_spindle' artifact.
# The context manager finishes the run (also on error) and replaces run.join(),
# a deprecated alias of run.finish().
with wandb.init(project='hpa', job_type='public_mitotic_spindle') as run:
    # Declare the source artifact as an input of this run; the returned handle
    # is not needed because `artifact` is rebound to the new artifact below.
    run.use_artifact('ayush-thakur/hpa/mitotic_spindle:v0', type='dataset')
    artifact = wandb.Artifact('mitotic_spindle', type='dataset')
    artifact.add_file('clean_pubic_mitotic_spindle.csv')
    run.log_artifact(artifact)

### Actin Filaments

In [None]:
# Download the public "actin filaments" class artifact.
# The context manager finishes the run even when use_artifact/download raises.
with wandb.init(project='hpa', job_type='consume_public_hpa_dataset') as run:
    artifact = run.use_artifact('ayush-thakur/hpa/actin_filaments:v0', type='dataset')
    artifact_dir = artifact.download()

In [None]:
# List the files inside the directory returned by artifact.download().
!ls {artifact_dir}

In [None]:
# Load the actin-filaments table and reduce each Image entry to the text after its last '/'.
public_df = pd.read_csv(f'{artifact_dir}/pubic_actin_filaments.csv')
public_df['Image'] = public_df['Image'].apply(lambda image_path: image_path.split('/')[-1])
public_df.head()

In [None]:
file_names = os.listdir('../input/singlelabelpublicactinfilaments/rgb')

# Build the image-id -> label lookup once instead of scanning the whole table
# for every file. keep='first' mirrors the original `.values[0]` choice.
label_by_image = public_df.drop_duplicates('Image', keep='first').set_index('Image')['Label_idx']

# Collect rows (and their listing positions) and create the frame once; growing
# a DataFrame with `.loc[i] = ...` copies it on every insertion.
row_index, rows = [], []
for i, filename in enumerate(tqdm(file_names)):
    # The image id is the file stem without its trailing '_<suffix>' part.
    img_id = '_'.join(filename.split('.')[0].split('_')[:-1])
    try:
        label = int(label_by_image.loc[img_id])
    except (KeyError, ValueError, TypeError):
        # Best effort: report files without a usable label and skip them.
        # Only lookup/conversion failures are caught, so KeyboardInterrupt and
        # unrelated bugs are no longer silently swallowed by a bare `except:`.
        print(filename)
        continue
    row_index.append(i)
    rows.append([filename, label])

# Keep the listing position as index, so skipped files leave gaps as before.
tmp_df = pd.DataFrame(rows, index=row_index, columns=raw_df.columns)

In [None]:
# Write the cleaned actin-filaments table to disk, then preview it.
# index=False is not passed, so the row index is written as an extra first column.
tmp_df.to_csv('clean_pubic_actin_filaments.csv')
tmp_df.head()

In [None]:
# Log the cleaned actin-filaments csv as a new version of the 'actin_filaments' artifact.
# The context manager finishes the run (also on error) and replaces run.join(),
# a deprecated alias of run.finish().
with wandb.init(project='hpa', job_type='public_actin_filaments') as run:
    # Declare the source artifact as an input of this run; the returned handle
    # is not needed because `artifact` is rebound to the new artifact below.
    run.use_artifact('ayush-thakur/hpa/actin_filaments:v0', type='dataset')
    artifact = wandb.Artifact('actin_filaments', type='dataset')
    artifact.add_file('clean_pubic_actin_filaments.csv')
    run.log_artifact(artifact)

# Merge All The Data (HuHa)

In [None]:
# Preview raw_df; its columns are the template used for the cleaned per-class tables.
raw_df.head()

In [None]:
# Preview the single-label cell-level table, the first frame in the merge below.
raw_single_label_df.head()

In [None]:
# Reload the cleaned per-class csv files written by the earlier cells.
# NOTE: aggresome_df and negative_df are rebound here; earlier cells used these
# names for the public tables whose 'Image' column they rewrote, so those cells
# must not be re-run after this one without re-running their read_csv lines.
nuclear_membrane_df = pd.read_csv('./clean_pubic_nuclear_membrane.csv')
mitotic_spindle_df = pd.read_csv('./clean_pubic_mitotic_spindle.csv')
actin_fragment_df = pd.read_csv('./clean_pubic_actin_filaments.csv')
aggresome_df = pd.read_csv('./clean_public_aggresome.csv')
negative_df = pd.read_csv('./clean_public_negative.csv')

In [None]:
# Preview the reloaded negative-class table.
negative_df.head()

In [None]:
# Frames to merge: the single-label cell-level table followed by the five
# cleaned per-class tables reloaded from disk.
dfs = [raw_single_label_df, 
       nuclear_membrane_df, 
       mitotic_spindle_df, 
       actin_fragment_df,
       aggresome_df,
       negative_df]

In [None]:
# Stack all frames with a fresh 0..n-1 index and keep only the ID and Label columns.
merged_columns = ['ID', 'Label']
single_label_cell_level_df = pd.concat(dfs, ignore_index=True)[merged_columns]
single_label_cell_level_df.head(20)

In [None]:
# Shuffle the rows. The fixed random_state makes the resulting order (and thus
# the csv/artifact written later) reproducible across Restart & Run All.
single_label_cell_level_df = single_label_cell_level_df.sample(frac=1, random_state=42).reset_index(drop=True)
single_label_cell_level_df.head(20)

In [None]:
# Ref: https://www.kaggle.com/divyanshuusingh/eda-image-segmentation
# Mapping from integer label id to its human-readable name.
label_names= {
0: "Nucleoplasm",
1: "Nuclear membrane",
2: "Nucleoli",
3: "Nucleoli fibrillar center",
4: "Nuclear speckles",
5: "Nuclear bodies",
6: "Endoplasmic reticulum",
7: "Golgi apparatus",
8: "Intermediate filaments",
9: "Actin filaments",
10: "Microtubules",
11: "Mitotic spindle",
12: "Centrosome",
13: "Plasma membrane",
14: "Mitochondria",
15: "Aggresome",
16: "Cytosol",
17: "Vesicles and punctate cytosolic patterns",
18: "Negative"
}


def _plot_label_distribution(label_values, title):
    """Print and bar-plot how often each label occurs in `label_values`.

    Bars, count annotations and tick names share the same x positions, one per
    label that is actually present, so the plot stays correctly labelled (and
    the tick/label counts match) even when some class is missing.

    Args:
        label_values: array-like of integer label ids (keys of `label_names`).
        title: text shown above the plot.

    Returns:
        The `(labels, counts)` pair produced by `np.unique`.
    """
    labels, counts = np.unique(label_values, return_counts=True)
    print(f'The unique labels are: {labels} and there values are: {counts}')

    positions = np.arange(len(labels))

    plt.figure(figsize=(15,5))
    plt.bar(positions, counts)

    for position, count in zip(positions, counts):
        plt.text(position-0.25, count, str(count), fontdict=dict(fontsize=10))

    plt.xticks(positions, labels=[label_names[int(label)] for label in labels], rotation=85)
    plt.title(title)
    plt.show()
    return labels, counts


# Merged dataset first, then the original single-label dataset for comparison.
labels, counts = _plot_label_distribution(single_label_cell_level_df.Label.values,
                                          'Merged single-label cell-level dataset')
labels, counts = _plot_label_distribution(raw_single_label_df.Label.values,
                                          'Original single-label cell-level dataset')

In [None]:
# Write the merged, shuffled table to disk for logging as an artifact.
# index=False is not passed, so the row index is written as an extra first column.
single_label_cell_level_df.to_csv('single_label_cell_level_full_dataset.csv')

In [None]:
# Artifacts the merged dataset was built from, declared as inputs of the run.
input_artifacts = [
    'ayush-thakur/hpa/single_label_cell_level:v0',
    'ayush-thakur/hpa/negative:v1',
    'ayush-thakur/hpa/aggresome:v1',
    'ayush-thakur/hpa/actin_filaments:v1',
    'ayush-thakur/hpa/mitotic_spindle:v1',
    'ayush-thakur/hpa/nuclear_membrane:v1',
]

# Log the merged csv as the 'slcl_full_dataset' artifact.
# The context manager finishes the run (also on error) and replaces run.join(),
# a deprecated alias of run.finish().
with wandb.init(project='hpa', job_type='slcl_full_dataset_creation') as run:
    # The returned handles are not needed; binding them to `_` would shadow
    # IPython's last-output variable.
    for input_artifact in input_artifacts:
        run.use_artifact(input_artifact, type='dataset')

    artifact = wandb.Artifact('slcl_full_dataset', type='dataset')
    artifact.add_file('single_label_cell_level_full_dataset.csv')
    run.log_artifact(artifact)