In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import random
import pandas as pd
from skimage import io
from joblib import Parallel, delayed
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold

import numpy as np

In [None]:
def seed_all(seed_value):
    """Seed the global random generators of `random` and NumPy, then report it.

    Torch / CUDA / cuDNN seeding is intentionally not performed here; torch is
    not used by this notebook.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed_value)
    print('# SEEDING DONE')


seed_all(42)

In [None]:
# Root directory of the competition data; the CSV metadata files are read from here.
base_path = '/kaggle/input/happy-whale-and-dolphin'

In [None]:
# Training metadata, plus the sample submission whose `image` column names the test files.
df = pd.read_csv(f'{base_path}/train.csv')
test_df = pd.read_csv(f'{base_path}/sample_submission.csv')

print(
    'Train Images: {:,} | Test Images: {:,}'.format(
        len(df),
        len(test_df),
    )
)

In [None]:
# Preview the first five rows of the training metadata.
df.head(5)

In [None]:
# Distinct species labels before any label fixes are applied.
df['species'].unique()

In [None]:
# Number of distinct individual ids in the training metadata.
len(df['individual_id'].unique())

In [None]:
# Species count before the label fixes; compared with the post-fix count further down.
n_classes_ori = len(df['species'].unique())

## Fix Meta Data
The following cells:
* Convert `beluga` and `globis` to whale species names so the 2-class (whale / dolphin) label comes out right.
* Fix duplicate (misspelled) species labels.

In [None]:
# Fix misspelled duplicate labels first, so every later step sees one spelling per species.
# https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/304633
# regex=False: the patterns are plain text, and the default for `regex` differs between pandas versions.
df['species'] = df['species'].str.replace('bottlenose_dolpin', 'bottlenose_dolphin', regex=False)
df['species'] = df['species'].str.replace('kiler_whale', 'killer_whale', regex=False)

# Convert beluga and globis to whale species names.
df.loc[df['species'].str.contains('beluga', regex=False), 'species'] = 'beluga_whale'
df.loc[df['species'].str.contains('globis', regex=False), 'species'] = 'short_finned_pilot_whale'
# NOTE(review): this is a substring match, so any other label containing
# 'pilot_whale' (e.g. a long-finned variant, if present) is merged too — confirm intended.
df.loc[df['species'].str.contains('pilot_whale', regex=False), 'species'] = 'short_finned_pilot_whale'

# 2-class label: a species whose name contains 'whale' is a whale, anything else a dolphin.
df['class'] = df['species'].map(lambda name: 'whale' if 'whale' in name else 'dolphin')

In [None]:
# Preview again after the species fixes; the `class` column is now present.
df.head(5)

In [None]:
# Species count after the label fixes.
n_classes = len(df['species'].unique())

In [None]:
# Report the species count before and after the fixes, one per line.
print(
    f'Num original classes: {n_classes_ori}',
    f'Num classes after fix: {n_classes}',
    sep='\n',
)

## Encode labels

In [None]:
def _encode_labels(frame, column, encoded_column):
    """Add an integer-encoded copy of `frame[column]` and describe the encoding.

    A fresh `LabelEncoder` is fitted on `frame[column]` and its output is
    stored in `frame[encoded_column]` (the frame is modified in place).

    Returns a 5-tuple:
        encoder      -- the fitted LabelEncoder
        labels       -- distinct labels, in order of first appearance
        codes        -- the encoder's code for each entry of `labels`
        unsorted_map -- {code: label} in the order of `labels`
        sorted_map   -- the same mapping in ascending code order with plain
                        `int` keys, so it can be passed to `json.dump`
    """
    encoder = LabelEncoder()
    frame[encoded_column] = encoder.fit_transform(frame[column])

    labels = list(frame[column].unique())
    codes = encoder.transform(labels)
    unsorted_map = dict(zip(codes, labels))
    # Codes are unique, so ordering by key matches ordering the (key, value) pairs.
    sorted_map = {int(code): unsorted_map[code] for code in sorted(unsorted_map)}
    return encoder, labels, codes, unsorted_map, sorted_map


# The same recipe for each of the three label columns; every name the
# previous copy-pasted version defined is still defined here.
(classes_encoder, classes_list, classes_encode_list,
 classes_unsorted_dict, classes_dict) = _encode_labels(df, 'class', 'class_encode')

(species_encoder, species_list, species_encode_list,
 species_unsorted_dict, species_dict) = _encode_labels(df, 'species', 'species_encode')

(individual_encoder, individual_list, individual_encode_list,
 individual_unsorted_dict, individual_dict) = _encode_labels(df, 'individual_id', 'individual_id_encode')

In [None]:
# Preview with the three `*_encode` columns added.
df.head(5)

In [None]:
import json

def save_json(file, data):
    """Write `data` to `file` as JSON.

    The text after the last '.' in `file` must be exactly 'json'; otherwise
    an AssertionError is raised before anything is written.
    """
    extension = file.rsplit('.', 1)[-1]
    assert extension == 'json'
    with open(file, 'w') as handle:
        json.dump(data, handle)
        
# Persist the three code -> label mappings next to the other notebook outputs.
for out_path, mapping in (
    ('/kaggle/working/classes.json', classes_dict),
    ('/kaggle/working/species.json', species_dict),
    ('/kaggle/working/individual_ids.json', individual_dict),
):
    save_json(out_path, mapping)

## Get shape information

In [None]:
%%time
def train_process(i):
    im = io.imread(f'../input/happy-whale-and-dolphin/train_images/' + df.iloc[i].image)
    shape = list(im.shape)
    return shape if len(shape) == 3 else shape + [1]
    
df['shape'] = Parallel(n_jobs=4)(delayed(train_process)(i) for i in range(len(df)))
df[['d0', 'd1', 'd2']] = pd.DataFrame(df['shape'].to_list())
df.drop(columns='shape', inplace=True)
df.to_csv('/kaggle/working/train_finetune.csv')

In [None]:
%%time
def test_process(i):
    im = io.imread('../input/happy-whale-and-dolphin/test_images/' + test_df.iloc[i].image)
    shape = list(im.shape)
    return shape if len(shape) == 3 else shape + [1]
    
test_df['shape'] = Parallel(n_jobs=4)(delayed(test_process)(i) for i in range(len(test_df)))
test_df[['d0', 'd1', 'd2']] = pd.DataFrame(test_df['shape'].to_list())
test_df.drop(columns='shape', inplace=True)
test_df.to_csv('/kaggle/working/test_finetune.csv')

## Split 5 & 10 folds

In [None]:
df_class_5f = df.copy()
skf = StratifiedKFold(n_splits=5)

for fold, ( _, val_) in enumerate(skf.split(X=df_class_5f, y=df_class_5f.class_encode)):
    df_class_5f.loc[val_ , "fold"] = fold
    
df_class_5f.to_csv('/kaggle/working/train_class_5fold.csv')

In [None]:
df_class_10f = df.copy()
skf = StratifiedKFold(n_splits=10)

for fold, ( _, val_) in enumerate(skf.split(X=df_class_10f, y=df_class_10f.class_encode)):
    df_class_10f.loc[val_ , "fold"] = fold
    
df_class_10f.to_csv('/kaggle/working/train_class_10fold.csv')

In [None]:
df_class_10f.head()

In [None]:
df_species_5f = df.copy()
skf = StratifiedKFold(n_splits=5)

for fold, ( _, val_) in enumerate(skf.split(X=df_species_5f, y=df_species_5f.species_encode)):
    df_species_5f.loc[val_ , "fold"] = fold
    
df_species_5f.to_csv('/kaggle/working/train_species_5fold.csv')

In [None]:
df_species_10f = df.copy()
skf = StratifiedKFold(n_splits=10)

for fold, ( _, val_) in enumerate(skf.split(X=df_species_10f, y=df_species_10f.species_encode)):
    df_species_10f.loc[val_ , "fold"] = fold
    
df_species_10f.to_csv('/kaggle/working/train_species_10fold.csv')

In [None]:
df_species_10f.head()

In [None]:
df_individual_5f = df.copy()
skf = StratifiedKFold(n_splits=5)

for fold, ( _, val_) in enumerate(skf.split(X=df_individual_5f, y=df_individual_5f.individual_id_encode)):
    df_individual_5f.loc[val_ , "fold"] = fold
    
df_individual_5f.to_csv('/kaggle/working/train_individual_5fold.csv')

In [None]:
df_individual_10f = df.copy()
skf = StratifiedKFold(n_splits=10)

for fold, ( _, val_) in enumerate(skf.split(X=df_individual_10f, y=df_individual_10f.individual_id_encode)):
    df_individual_10f.loc[val_ , "fold"] = fold
    
df_individual_10f.to_csv('/kaggle/working/train_individual_10fold.csv')

In [None]:
# Preview the 10-fold split stratified on individual id.
df_individual_10f.head(5)