# Downloading HPA public data

In [None]:
import io
import os
import requests
import pathlib
import gzip
import imageio
import pandas as pd


def tif_gzip_to_png(tif_path):
    '''Convert a gzip-compressed TIFF into a PNG saved in the same folder.

    Intended for a local workstation where the ``.tif.gz`` files are already
    on disk. The output path is the input path with the trailing ``.tif.gz``
    replaced by ``.png``.

    Args:
        tif_path: Path of the input file (``str`` or ``os.PathLike``); it
            must end in ``.tif.gz``.

    Raises:
        ValueError: If ``tif_path`` does not end in ``.tif.gz``.
    '''
    tif_path = os.fspath(tif_path)
    suffix = '.tif.gz'
    # Only the trailing suffix is swapped; a plain str.replace would also
    # rewrite '.tif.gz' appearing earlier in the path (e.g. a folder name).
    if not tif_path.endswith(suffix):
        raise ValueError(f'expected a path ending in {suffix!r}, got {tif_path!r}')
    png_path = pathlib.Path(tif_path[:-len(suffix)] + '.png')
    # Context manager so the gzip file handle is closed promptly.
    with gzip.open(tif_path) as gz:
        tf = gz.read()
    img = imageio.imread(tf, 'tiff')
    imageio.imwrite(png_path, img)
    
def download_and_convert_tifgzip_to_png(url, target_path, timeout=60):
    '''Download a gzip-compressed TIFF and write it as an image to target_path.

    Useful where the source files are not on disk, e.g. in a Kaggle notebook.

    Args:
        url: URL of the ``.tif.gz`` file to download.
        target_path: Where the decoded image is written with
            ``imageio.imwrite`` (expected to be a ``.png`` path).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    '''
    r = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of feeding an error page to gzip.
    r.raise_for_status()
    tf = gzip.decompress(r.content)
    img = imageio.imread(tf, 'tiff')
    imageio.imwrite(target_path, img)

In [None]:
# Load the public HPA metadata table.
public_hpa_df = pd.read_csv('../input/publichpa-withcellline/kaggle_2021.csv')

# Channel names; used as the per-channel suffix of each image URL.
colors = ['blue', 'red', 'green', 'yellow']

# The 17 cell lines used to build the filtered view below.
celllines = [
    'A-431', 'A549', 'EFO-21', 'HAP1', 'HEK 293', 'HUVEC TERT2',
    'HaCaT', 'HeLa', 'PC-3', 'RH-30', 'RPTEC TERT1', 'SH-SY5Y',
    'SK-MEL-30', 'SiHa', 'U-2 OS', 'U-251 MG', 'hTCEpi',
]

# Keep only the rows whose Cellline is one of the names listed above.
in_selected_celllines = public_hpa_df['Cellline'].isin(celllines)
public_hpa_df_17 = public_hpa_df.loc[in_selected_celllines]

# Row counts before and after filtering (displayed as the cell output).
len(public_hpa_df), len(public_hpa_df_17)


In [None]:
# Draw a reproducible (seeded) sample of 500 rows labelled 'No staining'
# and renumber them 0..499.
no_staining_mask = public_hpa_df['Label'] == 'No staining'
df = (
    public_hpa_df.loc[no_staining_mask]
    .sample(n=500, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the sampled frame.
df.head()

In [None]:
# Start by assuming every row downloads successfully; the download loop
# sets this flag to False for rows that fail.
df['downloaded'] = True

In [None]:
import zipfile  # standard-library module; no need to reach it through fastai

# Seconds to wait on each HTTP request before treating the image as failed.
request_timeout = 60

# Download every channel of every sampled image and store the decompressed
# bytes in a single zip archive.
# NOTE: entries are named '*.png' but contain the decompressed TIFF bytes
# unchanged (no image conversion happens here), so readers must decode
# them as TIFF.
with zipfile.ZipFile('negative_cells.zip', 'w') as img_out:
    for i, img in df['Image'].items():
        try:
            # Fetch all channels before writing anything, so a row that
            # fails part-way leaves no partial entries in the archive.
            channels = []
            for color in colors:
                img_url = f'{img}_{color}.tif.gz'
                fname = f'{os.path.basename(img)}_{color}.png'
                r = requests.get(img_url, timeout=request_timeout)
                # Reject HTTP error pages instead of passing them to gzip.
                r.raise_for_status()
                channels.append((fname, gzip.decompress(r.content)))
        except Exception as exc:
            # `except Exception` (not a bare except) keeps Ctrl-C working
            # during the long download. A single .loc call is used because
            # the chained df['downloaded'].loc[i] may write to a copy.
            df.loc[i, 'downloaded'] = False
            print(f'failed to download: {img} ({exc!r})')
        else:
            for fname, tf in channels:
                img_out.writestr(fname, tf)

In [None]:
# Save the sampled rows, including the 'downloaded' flag column, without
# writing the index.
df.to_csv('df_negative.csv', index=False)

In [None]:
# !unzip negative_cells.zip

# import matplotlib.pyplot as plt
# im = imageio.imread('956_D3_7_green.png', 'tiff')
# plt.imshow(im)