# Introduction

This kernel extracts the width and height of every train and test image.

# Let's do it

Install needed resources, import packages, read the train data.

In [None]:
!pip install imagesize

In [None]:
import numpy as np
import pandas as pd
import os
import imagesize
import time

In [None]:
# Load the training table from the competition's train.csv.
train_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/happy-whale-and-dolphin/train.csv'
)

Fix species names, add family information.

In [None]:
# Correct the misspelled species labels first, so every later step works on
# the corrected names. regex=False: the patterns are literal text, and the
# default value of `regex` is not the same across pandas versions.
train_df['species'] = (
    train_df['species']
    .str.replace('bottlenose_dolpin', 'bottlenose_dolphin', regex=False)
    .str.replace('kiler_whale', 'killer_whale', regex=False)
)

# Any label containing 'beluga' / 'globis' is rewritten to a single
# '<name>_whale' label.
train_df.loc[train_df['species'].str.contains('beluga', regex=False), 'species'] = 'beluga_whale'
train_df.loc[train_df['species'].str.contains('globis', regex=False), 'species'] = 'globis_whale'

# Family is derived from the cleaned label: names containing 'whale' are
# whales, everything else is treated as a dolphin.
train_df['family'] = train_df['species'].map(
    lambda name: 'whale' if 'whale' in name else 'dolphin'
)

Define a function that reads the size of an image.

In [None]:
def get_image_sizes_imagesize(file_name, image_dir='/kaggle/input/happy-whale-and-dolphin/train_images'):
    """Return the size of one image as ``[width, height]``.

    Parameters
    ----------
    file_name : str
        Name of the image file, relative to ``image_dir``.
    image_dir : str, optional
        Directory that contains the image. Defaults to the competition's
        training-image folder, so existing one-argument calls keep working.

    Returns
    -------
    list
        ``[width, height]`` as reported by ``imagesize.get``.
    """
    # NOTE(review): the values are passed through unchecked; confirm what
    # imagesize reports for files it cannot parse before relying on them.
    width, height = imagesize.get(os.path.join(image_dir, file_name))
    return [width, height]

Apply it to the train images.

In [None]:
# Look up the size of every training image and report how long it took.
start_time = time.time()
sample_size = len(train_df)

# One [width, height] pair per image, stacked into an (n_images, 2) array.
m = np.stack(
    train_df['image'].apply(get_image_sizes_imagesize).tolist()
)
df = pd.DataFrame(data=m, columns=['w', 'h'])

print(f"Total processing time for {sample_size} images (using imagesize): {round(time.time()-start_time, 2)} sec.")

Merge the image info to the train data.

In [None]:
# The name `df` is reused for the test-set sizes further down in this notebook.
# Refuse to merge when it does not match the training table row-for-row
# (for example after running cells out of order).
assert len(df) == len(train_df), "size table `df` does not match train_df; re-run the train size cell"

# Column-wise concat: both frames are expected to share the same default
# positional index, so row i of `df` describes row i of `train_df`.
train_img_df = pd.concat([train_df, df], axis=1, sort=False)
print(f"Number of different image sizes ({len(train_img_df)} image samples): {train_img_df.groupby(['w', 'h']).ngroups}")

Prepare a dataset with the image names for the test set.

In [None]:
# os.listdir returns entries in arbitrary order; sort them so the row order of
# the test table (and of the CSV written from it) is the same on every run.
test_image_list = sorted(os.listdir('/kaggle/input/happy-whale-and-dolphin/test_images'))
test_df = pd.DataFrame(test_image_list, columns=["image"])
print(test_df.shape)
test_df.head(2)

Extract image size for test data.

In [None]:
def get_image_sizes_imagesize_test(file_name, image_dir='/kaggle/input/happy-whale-and-dolphin/test_images'):
    """Return the size of one test image as ``[width, height]``.

    Parameters
    ----------
    file_name : str
        Name of the image file, relative to ``image_dir``.
    image_dir : str, optional
        Directory that contains the image. Defaults to the competition's
        test-image folder, so existing one-argument calls keep working.

    Returns
    -------
    list
        ``[width, height]`` as reported by ``imagesize.get``.
    """
    # NOTE(review): the values are passed through unchecked; confirm what
    # imagesize reports for files it cannot parse before relying on them.
    width, height = imagesize.get(os.path.join(image_dir, file_name))
    return [width, height]

In [None]:
# Look up the size of every test image and report how long it took.
start_time = time.time()
sample_size = len(test_df)

# One [width, height] pair per image, stacked into an (n_images, 2) array.
m = np.stack(
    test_df['image'].apply(get_image_sizes_imagesize_test).tolist()
)
df = pd.DataFrame(data=m, columns=['w', 'h'])

print(f"Total processing time for {sample_size} images from test data (using imagesize): {round(time.time()-start_time, 2)} sec.")

In [None]:
# The name `df` is also used for the training-set sizes earlier in this
# notebook. Refuse to merge when it does not match the test table row-for-row
# (for example after running cells out of order).
assert len(df) == len(test_df), "size table `df` does not match test_df; re-run the test size cell"

# Column-wise concat: both frames are expected to share the same default
# positional index, so row i of `df` describes row i of `test_df`.
test_img_df = pd.concat([test_df, df], axis=1, sort=False)

Save the train and test data, including image width and height.

In [None]:
# Write both enriched tables to the working directory, without the index column.
train_img_df.to_csv(path_or_buf="train_img_size.csv", index=False)
test_img_df.to_csv(path_or_buf="test_img_size.csv", index=False)