In [None]:
import os
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

In [None]:
# Create the output folder for the cropped images.
# exist_ok=True replaces the separate exists() check, so the cell is safe to
# re-run; makedirs also creates missing parent directories.
os.makedirs('/kaggle/working/cropped_imgs/', exist_ok=True)

In [None]:
# Folder holding the input CSVs (bounding-box table and the split "solution" files)
data_folder_path = "../input/hygieia-csv"
# Folder where the metadata CSVs are written; also used as root when zipping
csvs_out_path = "./"
# Folder with the source images, read as "<patientId>.jpg"
img_folder_path = "../input/rsna-pneumonia-detection-2018/input/images"
# Folder where the cropped images are saved
img_crop_folder_path = "./cropped_imgs"
# Split files, indexed later as solutions[0] (train), solutions[1] (valid), solutions[2] (test)
files = ['solution_train.csv', 'solution_valid.csv', 'solution_test.csv']

In [None]:
# Load the bounding-box table; later cells read its patientId, x, y, width, height and Target columns
metadata = pd.read_csv(os.path.join(data_folder_path, "image_bbox_full.csv"))

In [None]:
# One Series of patient ids per split file, in the same order as `files`
solutions = [
    pd.read_csv(os.path.join(data_folder_path, csv_name), index_col=0)['patientId']
    for csv_name in files
]

In [None]:
# Number of patient ids in each split file
list(map(len, solutions))

In [None]:
# Stack the per-split patientId Series into a single Series
solution_merged = pd.concat(solutions)

In [None]:
# Display the merged patient ids
solution_merged

In [None]:
# Keep only the metadata rows whose patientId appears in one of the split files
# (merge defaults to an inner join on the given key)
metadata = metadata.merge(solution_merged, on='patientId')

In [None]:
# Keep a single row per patient (drop_duplicates keeps the first occurrence by default)
metadata = metadata.drop_duplicates(subset='patientId')

## Compute box center

In [None]:
def compute_box_center(box_df):
    """Return a copy of ``box_df`` with the centre of each bounding box added.

    Parameters
    ----------
    box_df : pandas.DataFrame
        Must contain the columns ``x``, ``y``, ``width`` and ``height``.

    Returns
    -------
    pandas.DataFrame
        A new frame with two extra columns, ``xcenter = x + width / 2`` and
        ``ycenter = y + height / 2``. Rows whose box values are NaN get NaN
        centres. The frame passed in is not modified, so re-running the
        calling cell never alters a frame another variable still refers to.
    """
    return box_df.assign(
        xcenter=box_df['x'] + box_df['width'] / 2,
        ycenter=box_df['y'] + box_df['height'] / 2,
    )

In [None]:
# Add the xcenter / ycenter columns to the metadata table
metadata = compute_box_center(metadata)

## Separate patients by class

### Pneumonia patients

In [None]:
#Get only the bounding boxes patients
df_positive = metadata.dropna()
df_positive.head()

In [None]:
#Get only 1 bounding box per patients
df_positive = df_positive.drop_duplicates('patientId')
df_positive.head()

In [None]:
# Number of patients with a bounding box
len(df_positive)

### Non-pneumonia patients

In [None]:
# Patients labelled Target == 0
is_negative = metadata['Target'] == 0
df_negative = metadata.loc[is_negative]
df_negative.head()

In [None]:
# Number of Target == 0 patients
len(df_negative)

In [None]:
# Placeholder column for the crop file names, filled in by the cropping cells below.
# It is created with object dtype because it will hold strings such as '1.jpg';
# a float NaN column would need an implicit dtype upcast on the first string
# assignment, which pandas has deprecated since 2.1.
metadata['new_path'] = pd.Series(np.nan, index=metadata.index, dtype=object)

In [None]:
# NOTE(review): writes 0 into new_path for one hardcoded patient id. This looks like a
# leftover .loc assignment experiment; the cropping loops overwrite the value if this
# patient is in df_positive or df_negative — confirm and remove if not needed.
metadata.loc[metadata['patientId'] == '000924cf-0f8d-42bd-9158-1af53881a557', 'new_path'] = 0

In [None]:
# Display the metadata table with the new_path column
metadata

## Crop positive images

In [None]:
def getBox(xcenter, ycenter, img_size=100):
    """Return the bounds of a square of side ``img_size`` centred on a point.

    The result is the list ``[left, upper, right, lower]``, i.e. the centre
    coordinates shifted by half of ``img_size`` in each direction. The bounds
    are not clamped to any image size.
    """
    half_side = img_size / 2
    left, upper = xcenter - half_side, ycenter - half_side
    right, lower = xcenter + half_side, ycenter + half_side
    return [left, upper, right, lower]

In [None]:
# Crop a fixed-size patch around the box centre of every positive patient, save
# it as "<i>.jpg" and record that file name in metadata['new_path'].
# `i` is the running file number; after the loop it holds the next free number.
i = 1
for _row_index, row in df_positive.iterrows():
    patient_id = row['patientId']
    xcenter = int(row['xcenter'])
    ycenter = int(row['ycenter'])
    crop_name = f'{i}.jpg'

    # Save new name in df metadata
    metadata.loc[metadata['patientId'] == patient_id, 'new_path'] = crop_name

    # The context manager closes the source file once the crop has been written,
    # so file handles do not accumulate over the loop.
    with Image.open(os.path.join(img_folder_path, patient_id + ".jpg"), mode='r') as img:
        img.crop(getBox(xcenter, ycenter)).save(os.path.join(img_crop_folder_path, crop_name))
    i += 1

## Crop negative images

In [None]:
# Seed for reproducibility of df.sample(1)
np.random.seed(42)

# Continue the file numbering right after the positive crops (numbered from 1).
# Deriving the start from df_positive, instead of relying on the value of `i`
# left behind by the previous cell, keeps the numbering correct when this cell
# is re-run on its own.
i = len(df_positive) + 1

for patient_id in df_negative['patientId']:

    # Negative images have no box, so the crop is centred on the box centre of
    # a randomly drawn positive patient.
    # .iloc[0] extracts the single sampled row; calling int() directly on a
    # one-element Series is deprecated in recent pandas.
    random_sample = df_positive.sample(1).iloc[0]
    xcenter = int(random_sample['xcenter'])
    ycenter = int(random_sample['ycenter'])
    crop_name = f'{i}.jpg'

    # Save new name in df metadata
    metadata.loc[metadata['patientId'] == patient_id, 'new_path'] = crop_name

    # The context manager closes the source file once the crop has been written
    with Image.open(os.path.join(img_folder_path, patient_id + ".jpg"), mode='r') as img:
        img.crop(getBox(xcenter, ycenter)).save(os.path.join(img_crop_folder_path, crop_name))
    i += 1


## Get CSV

In [None]:
# Peek at a few of the written crops; uses the configured folder rather than a
# second hardcoded copy of the path
os.listdir(img_crop_folder_path)[0:5]

In [None]:
# Count the rows that never received a crop file name
metadata['new_path'].isnull().sum()

In [None]:
# One metadata CSV per split: the rows whose patientId appears in that split's
# solution file, written without the index column.
split_outputs = [
    ("metadata_train.csv", solutions[0]),  # Train
    ("metadata_valid.csv", solutions[1]),  # Valid
    ("metadata_test.csv", solutions[2]),   # Test
]
for csv_name, split_ids in split_outputs:
    in_split = metadata['patientId'].isin(split_ids)
    metadata[in_split].to_csv(os.path.join(csvs_out_path, csv_name), index=False)

# Full
# NOTE(review): unlike the per-split files, this one is written with the index column — confirm that is intended
metadata.to_csv(os.path.join(csvs_out_path, "metadata_full.csv"))

## Zip cropped images

In [None]:
# Archive only the cropped-image folder into "<img_crop_folder_path>.zip".
# With root_dir alone, the whole output directory was archived: the metadata
# CSVs and the archive file being written were included as well.
# base_dir limits the archive to the crop folder while keeping the same
# "cropped_imgs/<n>.jpg" member paths. It is resolved relative to root_dir,
# which holds for the configured values (both are relative to the working directory).
shutil.make_archive(img_crop_folder_path, 'zip', root_dir=csvs_out_path, base_dir=img_crop_folder_path)

In [None]:
# NOTE(review): prints a literal "2"; looks like a leftover marker cell — confirm and remove if unused
print("2")