In [None]:
import pickle
import warnings

import pandas as pd

# Silence FutureWarning messages for the rest of the notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Competition metadata: the training labels and the sample submission
# (the latter serves as the list of test images further down).
train = pd.read_csv('../input/happy-whale-and-dolphin/train.csv')
test = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')

# Precomputed image sizes. The context manager closes the file handle as soon
# as the pickle has been read instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code from the file; keep this
# only as long as the sizes.pkl dataset is trusted.
with open('../input/whale-image-eda-image-size/sizes.pkl', 'rb') as sizes_file:
    image_sizes = pickle.load(sizes_file)

In [None]:
def _size_records(split, path_prefix):
    """Build one record per image of a split from the loaded size data.

    Args:
        split: Key into ``image_sizes`` ('train_images' or 'test_images').
        path_prefix: Directory prefix removed from each stored name so that
            only the bare image file name remains.

    Returns:
        A list of dicts with the keys 'image', 'width' and 'height'.
    """
    entry = image_sizes[split]
    # size[0] is stored as the width and size[1] as the height.
    # NOTE(review): assumes sizes.pkl holds (width, height) pairs - confirm
    # against the notebook that produced it.
    return [
        {'image': name.replace(path_prefix, ''), 'width': size[0], 'height': size[1]}
        for name, size in zip(entry['names'], entry['sizes'])
    ]


train_sizes = _size_records('train_images', '../input/happy-whale-and-dolphin/train_images/')
test_sizes = _size_records('test_images', '../input/happy-whale-and-dolphin/test_images/')

In [None]:
# Turn each list of size records into a frame with one row per record.
train_images, test_images = (
    pd.DataFrame(records) for records in (train_sizes, test_sizes)
)

In [None]:
# Attach the size columns to the metadata frames, joining on the image name.
# With an inner join, rows that have no match in the other frame are dropped.
train = pd.merge(train, train_images, how='inner', on='image')
test = pd.merge(test, test_images, how='inner', on='image')

In [None]:
# Show the merged training frame.
train

In [None]:
# Tag every row with its split and add empty-string placeholder columns
# (`predictions` on train; `species` and `individual_id` on test).
train = train.assign(group='train', predictions='')
test = test.assign(group='test', species='', individual_id='')

# Stack the two splits and derive the height-to-width ratio of every image.
all_df = pd.concat([train, test], axis=0).assign(
    ratio=lambda frame: frame['height'] / frame['width']
)
all_df

In [None]:
# Image area as width times height.
all_df['area'] = all_df['width'].mul(all_df['height'])

In [None]:
# Renumber the rows from zero; drop=False keeps the previous index as a regular column.
all_df = all_df.reset_index(drop=False)

In [None]:
def fix_species_name(species):
    """Map known misspelled species labels to their canonical spelling.

    Args:
        species: Species label as found in the data.

    Returns:
        'bottlenose_dolphin' for 'bottlenose_dolpin' and 'killer_whale' for
        'kiler_whale'; any other value is returned unchanged.
    """
    # Misspelled label -> canonical label. The killer-whale entry maps to the
    # correct spelling (it used to normalise to the typo instead).
    corrections = {
        'bottlenose_dolpin': 'bottlenose_dolphin',
        'kiler_whale': 'killer_whale',
    }
    return corrections.get(species, species)
# Normalise the species column by passing every label through fix_species_name.
all_df['species'] = all_df['species'].apply(fix_species_name)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 
# One joint plot per pair of size features, coloured by train/test group.
# x, y and data are passed by keyword: seaborn 0.11 only warned about
# positional use, and seaborn 0.12+ rejects it with a TypeError.
for x_col, y_col, title in [
    ("width", "height", "Train vs Test (Width vs Height)"),
    ("width", "ratio", "Train vs Test (Width vs Ratio)"),
    ("height", "ratio", "Train vs Test (Height vs Ratio)"),
]:
    p = sns.jointplot(data=all_df, x=x_col, y=y_col, hue='group')
    p.fig.suptitle(title)
    p.fig.set_dpi(100)

# The train and test image dimensions overlap well

In [None]:
# Width-vs-height joint plot restricted to one width band at a time.
# Each band is half-open: prev <= width < curr.
# NOTE(review): rows with a width of 5000 or more fall outside every band and
# are therefore never plotted here.
prev = 0
for curr in [500, 1000, 1500, 2000, 5000]:
    in_band = (all_df['width'] >= prev) & (all_df['width'] < curr)
    # Keyword arguments are required by seaborn 0.12+ (positional x/y/data
    # raise a TypeError there).
    p = sns.jointplot(data=all_df[in_band], x="width", y="height", hue='group')
    p.fig.suptitle(f"Train vs Test Width:{prev} to {curr}")
    p.fig.set_dpi(100)
    prev = curr

In [None]:
# Width-vs-height joint plot restricted to one height band at a time.
# Each band is half-open: prev <= height < curr.
# NOTE(review): rows with a height of 5000 or more fall outside every band and
# are therefore never plotted here.
prev = 0
for curr in [500, 1000, 1500, 2000, 5000]:
    in_band = (all_df['height'] >= prev) & (all_df['height'] < curr)
    # Keyword arguments are required by seaborn 0.12+ (positional x/y/data
    # raise a TypeError there).
    p = sns.jointplot(data=all_df[in_band], x="width", y="height", hue='group')
    p.fig.suptitle(f"Train vs Test Height:{prev} to {curr}")
    p.fig.set_dpi(100)
    prev = curr

# Interesting: splitting the data by height looks like a good option

In [None]:
# Distinct species labels, leaving out the empty string.
all_species = set(all_df['species'].values).difference({''})
len(all_species)

In [None]:
# Width-vs-height joint plot for each species.
# Iterating in sorted order keeps the figure sequence identical on every run;
# plain set iteration order for strings can change between interpreter sessions.
# key=str keeps the sort working even if a non-string label (e.g. NaN) is present.
for name in sorted(all_species, key=str):
    # Keyword arguments are required by seaborn 0.12+ (positional x/y/data
    # raise a TypeError there).
    p = sns.jointplot(data=all_df[all_df['species'] == name], x="width", y="height", hue='group')
    p.fig.suptitle(f"{name} distribution")
    p.fig.set_dpi(100)

# **Fraser's dolphin (`frasiers_dolphin`) images are outliers compared with all other species**

In [None]:
# Write the combined frame to all.csv without the row index.
all_df.to_csv(path_or_buf='all.csv', index=False)

# Please upvote if you find this helpful

# Can we find the source from the width/height/species analysis?