# Downloading HPA public data

## Update 

**2021/02/26**

The column `in_trainset` now indicates which images are already part of the competition training set. Filter those rows out so you don't download the same images twice.

I have also added a column `Label_idx`. As mentioned in the [Single Cell Patterns](https://www.kaggle.com/lnhtrang/single-cell-patterns) notebook, some classes were merged for this competition. The function that converts label names to indices is included below in case you want to double-check the mapping.

In [1]:
import io
import os
import requests
import pathlib
import gzip
import imageio
import pandas as pd
import matplotlib.pyplot as plt


def tif_gzip_to_png(tif_path):
    '''Convert a local ``.tif.gz`` file to ``.png`` in the same folder.
    Eg. for working in local work station

    Parameters
    ----------
    tif_path : str or os.PathLike
        Path to the gzip-compressed TIFF. It must end with ``.tif.gz``.

    Raises
    ------
    ValueError
        If ``tif_path`` does not end with ``.tif.gz``. Without this check the
        output path would be identical to the input path, so the write would
        target the source file itself.
    '''
    suffix = '.tif.gz'
    tif_path = os.fspath(tif_path)
    if not tif_path.endswith(suffix):
        raise ValueError(f'expected a path ending in {suffix!r}, got {tif_path!r}')
    # Swap only the trailing suffix; str.replace would also rewrite any
    # earlier occurrence of '.tif.gz' inside a directory name.
    png_path = pathlib.Path(tif_path[:-len(suffix)] + '.png')
    # The context manager closes the gzip handle even if reading fails.
    with gzip.open(tif_path) as gz:
        tf = gz.read()
    img = imageio.imread(tf, 'tiff')
    imageio.imwrite(png_path, img)
    
def download_and_convert_tifgzip_to_png(url, target_path, timeout=60):
    '''Download a gzip-compressed TIFF from ``url`` and write it to ``target_path``.
    Eg. in Kaggle notebook

    Parameters
    ----------
    url : str
        Address of the ``.tif.gz`` image.
    target_path : str or path-like
        Where the decoded image is written; it is passed straight to
        ``imageio.imwrite``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    '''
    r = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of trying to gunzip an error page.
    r.raise_for_status()
    # Decompress in memory; no file handle is left open.
    tf = gzip.decompress(r.content)
    img = imageio.imread(tf, 'tiff')
    imageio.imwrite(target_path, img)
    
def download_and_convert_jpeg_to_png(url, target_path, timeout=60):
    '''Download an image from ``url`` and write it to ``target_path``.

    Parameters
    ----------
    url : str
        Address of the image (the notebook passes ``.jpg`` URLs).
    target_path : str or path-like
        Where the decoded image is written; it is passed straight to
        ``imageio.imwrite``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    '''
    r = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of trying to decode an error page.
    r.raise_for_status()
    f = io.BytesIO(r.content)
    img = imageio.imread(f)
    imageio.imwrite(target_path, img)

In [2]:
# All label names in the public HPA and their corresponding index.
# Names listed together on one row share the same index.
_location_groups = (
    (0, ("Nucleoplasm",)),
    (1, ("Nuclear membrane",)),
    (2, ("Nucleoli",)),
    (3, ("Nucleoli fibrillar center",)),
    (4, ("Nuclear speckles",)),
    (5, ("Nuclear bodies",)),
    (6, ("Endoplasmic reticulum",)),
    (7, ("Golgi apparatus",)),
    (8, ("Intermediate filaments",)),
    (9, ("Actin filaments", "Focal adhesion sites")),
    (10, ("Microtubules",)),
    (11, ("Mitotic spindle",)),
    (12, ("Centrosome", "Centriolar satellite")),
    (13, ("Plasma membrane", "Cell Junctions")),
    (14, ("Mitochondria",)),
    (15, ("Aggresome",)),
    (16, ("Cytosol",)),
    (17, ("Vesicles", "Peroxisomes", "Endosomes", "Lysosomes",
          "Lipid droplets", "Cytoplasmic bodies")),
    (18, ("No staining",)),
)
all_locations = {
    name: index
    for index, names in _location_groups
    for name in names
}


def add_label_idx(df, all_locations, verbose=False):
    '''Add a ``Label_idx`` column holding the class indexes of each row's labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``Label`` column of comma-separated location names.
        The frame is modified in place.
    all_locations : dict
        Maps a location name to its class index.
    verbose : bool, optional
        When True, print each ``Label`` next to its computed ``Label_idx``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. ``Label_idx`` is a ``'|'``-joined string of
        indexes in the order the names appear, or ``None`` when no name of
        the row is found in ``all_locations`` (or the label is not a string).
    '''
    def names_to_idx(label_str):
        # Missing labels (e.g. NaN) cannot be split; treat them as "no label".
        if not isinstance(label_str, str):
            return None
        # Strip so that "A, B" is handled the same as "A,B".
        names = (name.strip() for name in label_str.split(','))
        idx = [str(all_locations[name]) for name in names if name in all_locations]
        return "|".join(idx) if idx else None

    # Build the column in one pass; this does not depend on the index being
    # unique, unlike assigning row by row through df.loc[i, ...].
    df["Label_idx"] = pd.Series(
        [names_to_idx(label) for label in df["Label"]],
        index=df.index,
        dtype=object,
    )

    if verbose:
        for label, label_idx in zip(df["Label"], df["Label_idx"]):
            print(label, label_idx)
    return df
    

In [3]:
def sort_target_labels(target):
    '''Return a ``'|'``-separated label string with duplicates removed and
    the labels in ascending numeric order.

    ``target`` is a string of integer labels separated by ``'|'``.
    '''
    unique_ids = {int(token) for token in target.split('|')}
    return '|'.join(map(str, sorted(unique_ids)))

In [4]:
public_hpa_df = pd.read_csv('../input/publichpa-withcellline/kaggle_2021.tsv')

# Keep a row only if it
#   * does not overlap with the training set, and
#   * has at least one label that is part of this competition
#     (rows without one have a missing Label_idx).
keep_rows = public_hpa_df.in_trainset.eq(False) & public_hpa_df.Label_idx.notna()

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the raw table (avoids pandas' SettingWithCopyWarning).
public_hpa_df = public_hpa_df[keep_rows].copy()
public_hpa_df['Label_idx'] = public_hpa_df.Label_idx.apply(sort_target_labels)

colors = ['blue', 'red', 'green', 'yellow']
celllines = ['A-431', 'A549', 'EFO-21', 'HAP1', 'HEK 293', 'HUVEC TERT2', 'HaCaT', 'HeLa', 'PC-3', 'RH-30', 'RPTEC TERT1', 'SH-SY5Y', 'SK-MEL-30', 'SiHa', 'U-2 OS', 'U-251 MG', 'hTCEpi']
# Subset restricted to the cell lines listed above.
public_hpa_df_17 = public_hpa_df[public_hpa_df.Cellline.isin(celllines)]
len(public_hpa_df), len(public_hpa_df_17)


(77668, 67462)

# Gather images for underrepresented labels

In [5]:
def get_label_nsample_summary(df, col_targ='Target', standard_scale=False, n_labels=19):
    '''
    For each class label and target length, return the number
    of samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per sample. It is not modified.
    col_targ : str
        Column holding the ``'|'``-separated label string of each sample.
    standard_scale : bool
        When True, divide every row of the summary by that row's
        ``max - min``.
    n_labels : int
        Number of class labels; the columns are ``'0'`` .. ``str(n_labels - 1)``.

    Returns
    -------
    pandas.DataFrame
        Rows are target lengths (number of labels on a sample), columns are
        labels, and each cell is the number of samples of that length that
        carry that label (0 where there are none).
    '''
    df = df.copy()
    df['label_list'] = df[col_targ].apply(lambda targ: targ.split('|'))
    df['target_length'] = df['label_list'].apply(len)

    labels = [str(label) for label in range(n_labels)]

    per_label_counts = []
    for label in labels:
        # astype(bool) keeps the mask boolean even when df has no rows.
        has_label = df['label_list'].apply(lambda xs, label=label: label in xs).astype(bool)
        # size() counts rows. count() on the first column would instead count
        # that column's non-null values and undercount when it has NaN.
        per_label_counts.append(df[has_label].groupby('target_length').size())

    summary = pd.concat(per_label_counts, axis=1, keys=labels, names='label')

    # A missing (length, label) combination means zero samples; cast back to
    # int so every count column has the same dtype.
    summary = summary.fillna(0).astype(int)

    if standard_scale:
        # NOTE(review): the row minimum is not subtracted and a constant row
        # divides by zero -- confirm this is the intended scaling.
        summary = summary.apply(lambda row: row / (row.max() - row.min()), axis=1)

    return summary

In [6]:
# Count samples per (target length, label) for the filtered public HPA table.
get_label_nsample_summary(public_hpa_df, col_targ='Label_idx')

label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
target_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,15700,263.0,959,343.0,1355,617.0,1171,1808,295.0,372,506.0,13,616,2285,3770.0,13.0,7810,4627,5456.0
2,15213,587.0,2500,754.0,987,1203.0,646,2068,228.0,680,488.0,95,869,4574,1770.0,94.0,12225,4297,0.0
3,3588,257.0,911,304.0,161,504.0,175,783,54.0,325,121.0,77,465,1761,338.0,30.0,3395,1070,0.0
4,220,50.0,78,22.0,16,46.0,10,49,5.0,39,16.0,6,48,133,22.0,3.0,223,82,0.0
5,10,0.0,2,0.0,2,0.0,2,2,0.0,4,0.0,2,4,8,0.0,0.0,10,4,0.0


In [7]:
# Helper columns used by the selection functions: the labels of each image
# as a list of strings, and how many labels that list contains.
public_hpa_df['label_list'] = public_hpa_df['Label_idx'].str.split('|')
public_hpa_df['target_length'] = public_hpa_df['label_list'].str.len()

In [8]:
def get_underreps():
    '''
    Under-represented labels, grouped by target length.

    Copied from "humanpro_data_skewedness".
    e.g. `3:[11]` means that there are fewer than 1000 cell samples that have
    a target/label_idx of length 3 and with the label 11 in it.

    Returns a dict mapping target length (int) to a list of labels given
    as strings.
    '''
    labels_by_length = {
        1: [11],
        2: [11, 18],
        3: [4, 6, 10, 11, 15, 17, 18],
        # every label except 0 and 16
        4: [label for label in range(19) if label not in (0, 16)],
        # every label
        5: list(range(19)),
    }
    as_strings = {}
    for nlabel, labels in labels_by_length.items():
        as_strings[nlabel] = list(map(str, labels))
    return as_strings

def get_samples_underrep(df, target_length=1, under_labels=('11',)):
    '''
    Get samples corresponding to the under-represented classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``target_length`` column and a ``label_list`` column
        (list of label strings per row).
    target_length : int
        Keep only rows whose ``target_length`` equals this value.
    under_labels : iterable
        Keep only rows whose ``label_list`` contains at least one of these
        labels. Entries are converted to ``str`` before comparison, so
        ``11`` and ``'11'`` select the same rows.

    Returns
    -------
    pandas.DataFrame
        Matching rows with a fresh 0..n-1 index. If nothing matches, the
        result is empty but keeps all columns of ``df``.
    '''
    wanted = {str(label) for label in under_labels}
    has_length = df['target_length'] == target_length
    # astype(bool) keeps the mask boolean even when df has no rows; an empty
    # object-dtype mask would be read by pandas as a (column) label list.
    has_label = df['label_list'].apply(lambda ls: not wanted.isdisjoint(ls)).astype(bool)
    return df[has_length & has_label].reset_index(drop=True)

def get_images_to_download(df, underreps):
    '''
    Collect the rows of ``df`` that belong to any under-represented
    (target length, label) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed unchanged to ``get_samples_underrep``.
    underreps : dict
        Maps a target length to the labels considered under-represented
        for that length.

    Returns
    -------
    pandas.DataFrame
        The selected rows of all target lengths stacked in the iteration
        order of ``underreps``, with a fresh 0..n-1 index. An empty frame
        if ``underreps`` is empty.
    '''
    selected = []
    for target_length, under_labels in underreps.items():
        df_target_length = get_samples_underrep(df, target_length, under_labels)
        print(f'Target length: {target_length}.  Number of images to download: {len(df_target_length)}')
        selected.append(df_target_length)

    # pd.concat raises on an empty list, so handle "nothing requested" here.
    if not selected:
        return pd.DataFrame()
    # Concatenate once: DataFrame.append was removed in pandas 2.0, and
    # appending inside the loop re-copies the accumulated rows every time.
    return pd.concat(selected, ignore_index=True)

In [9]:
# Select the public HPA rows that match any under-represented
# (target length, label) combination; the last line displays how many rows
# were selected.
underreps = get_underreps()
df_tograb = get_images_to_download(public_hpa_df, underreps)
len(df_tograb)

Target length: 1.  Number of images to download: 13
Target length: 2.  Number of images to download: 95
Target length: 3.  Number of images to download: 1519
Target length: 4.  Number of images to download: 267
Target length: 5.  Number of images to download: 10


1904

In [10]:
%%time

# To try the loop on a few images first, iterate over a slice instead,
# e.g. df_tograb['Image'].iloc[:10].

save_dir = '/kaggle/publichpa' #os.path.join(os.getcwd(),'publichpa')
# exist_ok avoids the separate "does it exist?" check.
os.makedirs(save_dir, exist_ok=True)

for img in df_tograb['Image']:
    try:
        # One file per channel. If any channel fails, the remaining channels
        # of this image are skipped and the loop moves on to the next image.
        for color in colors:
            # Alternative source (.tif.gz instead of .jpg):
            #   img_url = f'{img}_{color}.tif.gz'
            #   download_and_convert_tifgzip_to_png(img_url, save_path)
            img_url = f'{img}_{color}.jpg'
            save_path = os.path.join(save_dir, f'{os.path.basename(img)}_{color}.png')
            download_and_convert_jpeg_to_png(img_url, save_path)
            print(f'Downloaded {img_url} as {save_path}')
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt stop
        # the cell; the error is printed so failures can be diagnosed.
        print(f'failed to download: {img} ({err!r})')

Downloaded https://images.proteinatlas.org/31240/1645_E5_33_blue.jpg as /kaggle/publichpa/1645_E5_33_blue.png
Downloaded https://images.proteinatlas.org/31240/1645_E5_33_red.jpg as /kaggle/publichpa/1645_E5_33_red.png
Downloaded https://images.proteinatlas.org/31240/1645_E5_33_green.jpg as /kaggle/publichpa/1645_E5_33_green.png
Downloaded https://images.proteinatlas.org/31240/1645_E5_33_yellow.jpg as /kaggle/publichpa/1645_E5_33_yellow.png
Downloaded https://images.proteinatlas.org/31240/1645_E5_34_blue.jpg as /kaggle/publichpa/1645_E5_34_blue.png
Downloaded https://images.proteinatlas.org/31240/1645_E5_34_red.jpg as /kaggle/publichpa/1645_E5_34_red.png
Downloaded https://images.proteinatlas.org/31240/1645_E5_34_green.jpg as /kaggle/publichpa/1645_E5_34_green.png
Downloaded https://images.proteinatlas.org/31240/1645_E5_34_yellow.jpg as /kaggle/publichpa/1645_E5_34_yellow.png
Downloaded https://images.proteinatlas.org/40766/1773_A5_9_blue.jpg as /kaggle/publichpa/1773_A5_9_blue.png
Down

In [11]:
# Report the total on-disk size of the download folder.
! du -hs  /kaggle/publichpa/

18G	/kaggle/publichpa/


In [12]:
# Zip the download folder from its parent directory so that paths inside the
# archive start with publichpa/, then move the archive to /kaggle/working and
# continue from there.
# NOTE(review): presumably /kaggle/working is used because it is the
# notebook's output directory -- confirm.
%cd /kaggle
! zip -rq publichpa.zip publichpa/
! mv publichpa.zip /kaggle/working/.
%cd /kaggle/working

/kaggle
/kaggle/working


In [13]:
# NOTE(review): unexplained back-of-envelope calculation; the meaning and
# units of 20, 0.096, 3 and 60 are not stated anywhere in this notebook --
# presumably a run-time estimate in minutes. Confirm and name the constants,
# or remove this cell.
(20 / 0.096) * 3 / 60

10.416666666666666

In [14]:
# Drop the two helper columns before saving the table of downloaded images.
# Rebinding the name (no inplace) with errors='ignore' means re-running this
# cell does not raise a KeyError once the columns are already gone.
df_tograb = df_tograb.drop(columns=['label_list', 'target_length'], errors='ignore')
df_tograb.to_feather('publichpa.feather')

In [15]:
# import cv2

# import matplotlib.pyplot as plt
# plt.imshow(cv2.imread('/kaggle/working/publichpa/953_C12_3_red.png'))