## Installing Libraries

In [None]:
!pip install imagesize

## Importing Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import imagesize
from tqdm.notebook import tqdm


%matplotlib inline

## Reading Data

In [None]:
# Root folder of the dataset (a relative path).
root_dir = "../input/happy-whale-and-dolphin/"

# Image folders for the train and test splits.
train_dir, test_dir = (
    os.path.join(root_dir, folder) for folder in ("train_images", "test_images")
)

# CSV files listing the images of each split; the test split is described by
# the sample submission file.
train_csv, test_csv = (
    os.path.join(root_dir, filename)
    for filename in ("train.csv", "sample_submission.csv")
)

In [None]:
# Load the training table and add a "path" column that joins the training
# image folder with each entry of the "image" column.
train_df = pd.read_csv(train_csv).assign(
    path=lambda frame: frame["image"].map(
        lambda image_name: os.path.join(train_dir, image_name)
    )
)

# Same for the test table, using the test image folder.
test_df = pd.read_csv(test_csv).assign(
    path=lambda frame: frame["image"].map(
        lambda image_name: os.path.join(test_dir, image_name)
    )
)

In [None]:
# Report the number of rows in each split.
train_count, test_count = len(train_df), len(test_df)
print(f"Train Dataset:{train_count} and Test Dataset {test_count}")

In [None]:
# Canonicalise the species labels:
#   * any label containing "beluga" becomes beluga_whale
#   * any label containing "globis" becomes short_finned_pilot_whale
for fragment, canonical in (
    ('beluga', 'beluga_whale'),
    ('globis', 'short_finned_pilot_whale'),
):
    matches = train_df['species'].str.contains(fragment)
    train_df.loc[matches, 'species'] = canonical

# The bare label pilot_whale is also treated as short_finned_pilot_whale.
# This is an exact-value replacement, so long_finned_pilot_whale is kept
# separate from short_finned_pilot_whale.
train_df['species'] = train_df['species'].replace(
    {'pilot_whale': 'short_finned_pilot_whale'}
)

# Correct misspelled labels so they no longer appear as duplicate species.
for misspelled, corrected in (
    ('bottlenose_dolpin', 'bottlenose_dolphin'),
    ('kiler_whale', 'killer_whale'),
):
    train_df['species'] = train_df['species'].str.replace(misspelled, corrected)

# Add a coarse "class" column: labels that mention "whale" are whales,
# every other label is counted as a dolphin.
train_df['class'] = train_df['species'].map(
    lambda name: 'dolphin' if 'whale' not in name else 'whale'
)


In [None]:
def get_image_size(row):
    """Return a copy of ``row`` extended with the image's dimensions.

    The file at ``row['path']`` is inspected with ``imagesize.get`` and the
    two values it returns are stored under ``'width'`` and ``'height'``.

    The row is copied before it is modified: pandas documents that functions
    passed to ``DataFrame.apply`` must not mutate the object they receive.

    Parameters
    ----------
    row : pandas.Series
        One row of a DataFrame; it must contain a ``'path'`` entry.

    Returns
    -------
    pandas.Series
        A new Series holding the original entries plus ``'width'`` and
        ``'height'``.
    """
    sized_row = row.copy()
    sized_row['width'], sized_row['height'] = imagesize.get(row['path'])
    return sized_row

# tqdm.pandas() must be called before `progress_apply` is used; `desc` is the
# label passed for the progress bar.
tqdm.pandas(desc="Train")
# Call get_image_size on every row (axis=1) so each row gains the image's
# width and height.
train_df = train_df.progress_apply(get_image_size, axis=1)

# Register again with a different label, then process the test rows the same way.
tqdm.pandas(desc="Test")
test_df = test_df.progress_apply(get_image_size, axis=1)

In [None]:
# Display the training frame, which now carries the added path, class,
# width and height columns.
train_df

In [None]:
# Display the test frame, which now carries the added path, width and
# height columns.
test_df