> Resize images for classification task

References:

- [https://www.kaggle.com/xhlulu/siim-covid-19-convert-to-jpg-256px](https://www.kaggle.com/xhlulu/siim-covid-19-convert-to-jpg-256px)
- [https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way](https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way)

`train.tar.gz` contains the resized training images, organized into one folder per label:
  - negative
  - typical
  - atypical
  - indeterminate
  

<link href="https://fonts.googleapis.com/css?family=Merriweather:300,300i,400,400i,700,700i,900,900i" rel='stylesheet' >
<link href="https://fonts.googleapis.com/css?family=Source+Sans+Pro:300,300i,400,400i,700,700i" rel='stylesheet' >
<link href='https://fonts.googleapis.com/css?family=Source+Code+Pro:300,400' rel='stylesheet' >
<style>
@font-face {
    font-family: "Computer Modern";
    src: url('http://mirrors.ctan.org/fonts/cm-unicode/fonts/otf/cmunss.otf');
}
</style>

In [None]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import plotly.io as pio
from plotly.subplots import make_subplots
# setting default template to plotly_white for all visualizations
pio.templates.default = "plotly_white"
%matplotlib inline
import gc

from colorama import Fore, Back, Style

# Short aliases for colorama's foreground colour codes.
y_, r_, g_, b_, m_, c_ = (
    Fore.YELLOW,
    Fore.RED,
    Fore.GREEN,
    Fore.BLUE,
    Fore.MAGENTA,
    Fore.CYAN,
)
# Alias for colorama's reset-all style code.
res = Style.RESET_ALL

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Install python-gdcm into the notebook environment.
# NOTE(review): presumably needed so pydicom can decode compressed DICOM pixel data - confirm.
!pip install python-gdcm

In [None]:
# Display settings: show every column and never truncate long cell values.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Load the image-level and study-level label tables of the competition.
image_df, study_df = (
    pd.read_csv(f'/kaggle/input/siim-covid19-detection/train_{level}_level.csv', index_col=None)
    for level in ('image', 'study')
)
# Notebook display: (rows, columns) of both tables.
image_df.shape, study_df.shape

In [None]:
import os

# Flat list holding the full path of every file found anywhere under /kaggle/input.
all_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]

In [None]:
# Partition the discovered paths by the split directory that appears in them.
train_files = [path for path in all_files if '/train/' in path]
test_files = [path for path in all_files if '/test/' in path]

In [None]:
# Sanity check: how many train and test files were discovered.
len(train_files),len(test_files)

In [None]:
from pydicom import read_file

def show_image(img, figsize=None, ax=None, cmap="gray"):
    """Draw ``img`` on a matplotlib axes with both axis rulers hidden.

    Args:
        img: Image data accepted by ``ax.imshow``.
        figsize: Figure size forwarded to ``plt.subplots`` when a new figure
            has to be created; unused if ``ax`` is supplied.
        ax: Existing axes to draw on. A new figure and axes are created only
            when this is ``None``.
        cmap: Colormap passed to ``imshow``.
    """
    # Compare against None explicitly: truthiness is the wrong test for
    # "was an axes supplied" and would discard any falsy axes-like object.
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)
    ax.imshow(img, cmap=cmap)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

def get_image(file):
    """Read the DICOM file at ``file`` and return its ``pixel_array``."""
    return read_file(file, stop_before_pixels=False).pixel_array

def get_dicom(file):
    """Read the DICOM file at ``file``, pixel data included, and return the parsed object."""
    return read_file(file, stop_before_pixels=False)

In [None]:
# Notebook display of the image-level table.
image_df

In [None]:
def find_file(img_id):
    """Return the first path in ``train_files`` that contains ``img_id``.

    Args:
        img_id: Image identifier to look for as a substring of each path.

    Returns:
        The first matching path, in ``train_files`` order.

    Raises:
        IndexError: If no path in ``train_files`` contains ``img_id``.
    """
    # Stop at the first hit instead of scanning and collecting the whole list.
    match = next((file for file in train_files if img_id in file), None)
    if match is None:
        # IndexError is what a failed lookup has always raised here; the
        # message now names the identifier that could not be resolved.
        raise IndexError(f"no training file found for image id {img_id!r}")
    return match
# The image id is whatever precedes the first underscore of the `id` column.
image_df['img_id'] = image_df['id'].apply(lambda raw_id: raw_id.partition('_')[0])
# Resolve each image id to the path of its file among the training files.
image_df['file'] = image_df['img_id'].apply(find_file)

In [None]:
# Reshape the study table from wide (one column per label) to long form:
# one row per (id, label) pair with the original cell stored in `value`.
study_grp = pd.melt(study_df, id_vars=list(study_df.columns)[:1], value_vars=list(study_df.columns)[1:],
             var_name='label', value_name='value')
# Keep only the non-zero cells. The explicit copy makes study_grp an
# independent frame, so the column assignments below are not chained
# assignments on a filtered slice.
study_grp = study_grp.loc[study_grp['value']!=0].copy()
# Long CSV column names -> short class names used for the output folders.
lbl_map = {'Negative for Pneumonia' : 'negative', 'Typical Appearance' : 'typical',
       'Indeterminate Appearance' : 'indeterminate', 'Atypical Appearance' : 'atypical'}

# The study UID is the part of `id` before the first underscore.
study_grp['StudyInstanceUID'] = study_grp['id'].apply(lambda x: x.split('_')[0])
# Indexing (rather than Series.map) raises KeyError on an unexpected label.
study_grp['label'] = study_grp['label'].apply(lambda x: lbl_map[x])
# StudyInstanceUID -> class name. If a study had several non-zero labels,
# the last one in row order would win.
study_map = dict(zip(study_grp.StudyInstanceUID, study_grp.label))

In [None]:
# Count the non-null `id` entries of the image table for each StudyInstanceUID.
img_grp = image_df.groupby(['StudyInstanceUID'])['id'].count().reset_index()
# Notebook display of the per-study counts.
img_grp

In [None]:
# Notebook display: first rows of the image table.
image_df.head()

In [None]:
# Label every image with the class recorded for its study in study_map.
# Direct dictionary indexing means an unknown study raises KeyError.
image_df['class'] = image_df['StudyInstanceUID'].apply(study_map.__getitem__)

In [None]:
# Notebook display: first rows of the image table.
image_df.head()

In [None]:
from tqdm import tqdm
import cv2 
from pydicom.pixel_data_handlers.util import apply_voi_lut

def resize_imagev_v0(img, image_size=(512,512)):
    """Return ``img`` resized to ``image_size`` with ``cv2.resize`` defaults."""
    return cv2.resize(img, image_size)

def resize_image(dicom, image_size=(512,512)):
    """Convert a parsed DICOM object into a resized 8-bit image.

    Args:
        dicom: Object exposing ``pixel_array`` and
            ``PhotometricInterpretation``, as returned by ``get_dicom``.
        image_size: Target size forwarded to ``cv2.resize``.

    Returns:
        A ``uint8`` array, min-max scaled to the 0..255 range and resized
        to ``image_size``. A constant-valued input yields an all-zero image.
    """
    img = apply_voi_lut(dicom.pixel_array, dicom)
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        # Invert the intensities so MONOCHROME1 images end up with the same
        # polarity as the rest.
        img = np.amax(img) - img
    img = img - np.min(img)
    peak = np.max(img)
    # A flat image has peak == 0 after the shift; dividing would produce NaN
    # and an undefined uint8 cast, so only scale when there is a range.
    if peak > 0:
        img = img / peak
    img = (img * 255).astype(np.uint8)
    return cv2.resize(img, image_size)

print('Resizing train Images')
BASE_PATH = '/kaggle/out_dir/train'
os.makedirs(BASE_PATH, exist_ok=True)
# One sub-folder per class; images are written under the folder of their label.
for lbl in image_df['class'].unique():
    os.makedirs(f'{BASE_PATH}/{lbl}', exist_ok=True)

# Mapping img_id -> {'file': ..., 'StudyInstanceUID': ..., 'class': ...}.
trn_files_dict = image_df[['img_id','file','StudyInstanceUID','class']].set_index('img_id').T.to_dict()
failed_train_writes = []
for iid, rec in tqdm(trn_files_dict.items()):
    out_file = f"{BASE_PATH}/{rec['class']}/{iid}.jpg"
    img = resize_image(get_dicom(rec['file']))
    # cv2.imwrite signals failure through its return value instead of
    # raising, so record the path rather than losing the image silently.
    if not cv2.imwrite(out_file, img):
        failed_train_writes.append(out_file)

# Report after the loop so the progress bar output is not interleaved.
if failed_train_writes:
    print(f'Failed to write {len(failed_train_writes)} train images, e.g. {failed_train_writes[:5]}')


In [None]:
print('Resizing test Images')
TEST_PATH = '/kaggle/out_dir/test'
os.makedirs(TEST_PATH, exist_ok=True)
failed_test_writes = []
for file in tqdm(test_files):
    # Output name: the source file name up to its first dot, plus ".jpg".
    out_file = TEST_PATH + '/' + file.split('/')[-1].split('.')[0] + '.jpg'
    img = resize_image(get_dicom(file))
    # cv2.imwrite signals failure through its return value instead of
    # raising, so record the path rather than losing the image silently.
    if not cv2.imwrite(out_file, img):
        failed_test_writes.append(out_file)

# Report after the loop so the progress bar output is not interleaved.
if failed_test_writes:
    print(f'Failed to write {len(failed_test_writes)} test images, e.g. {failed_test_writes[:5]}')

In [None]:
# Pack each output tree into a gzip-compressed tarball in the current directory.
# `-C <dir> .` archives the directory's contents with paths relative to it.
!tar -zcf train.tar.gz -C "/kaggle/out_dir/train/" .
!tar -zcf test.tar.gz -C "/kaggle/out_dir/test/" .